From 548fe51740d0f3294e548f654c099e5aefbf4cb7 Mon Sep 17 00:00:00 2001 From: Madhav Bhatt Date: Thu, 17 Apr 2025 02:45:43 -0700 Subject: firmware: xilinx: Add debugfs support for PM_GET_NODE_STATUS Add new debug interface to support PM_GET_NODE_STATUS to get the node information like requirements and usage. The debugfs firmware driver interface is only meant for testing and debugging EEMI APIs. Hence, it is by-default disabled in production systems. Signed-off-by: Madhav Bhatt Link: https://lore.kernel.org/r/20250417094543.3873507-1-madhav.bhatt@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..4699f50465f2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,7 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx - * Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. + * Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc. * * Michal Simek * Davorin Mista @@ -164,6 +164,7 @@ enum pm_api_cb_id { enum pm_api_id { PM_API_FEATURES = 0, PM_GET_API_VERSION = 1, + PM_GET_NODE_STATUS = 3, PM_REGISTER_NOTIFIER = 5, PM_FORCE_POWERDOWN = 8, PM_REQUEST_WAKEUP = 10, @@ -629,6 +630,8 @@ int zynqmp_pm_request_wake(const u32 node, int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode); int zynqmp_pm_set_rpu_mode(u32 node_id, enum rpu_oper_mode rpu_mode); int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode); +int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, u32 *const usage); int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, u32 value); @@ -931,6 +934,13 @@ static inline int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mo return -ENODEV; } +static inline int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, + u32 *const usage) +{ + return -ENODEV; +} + static inline int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value) -- cgit v1.2.3 From e66f4c35e375346943bfe2a0990e97253f74440f Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Tue, 1 Jul 2025 05:38:50 -0700 Subject: drivers: firmware: xilinx: Add unique family code for all platforms The family code is currently derived from the PMC_TAP_IDCODE register value, but there are issues where Versal, Versal NET, and future platforms share the same family code. Additionally for some platforms have identical subfamily code, making it challenging to differentiate between platforms based on the family and subfamily codes. To resolve this, a new family code member is added to the platform data, initialized with unique values. This change enables better platform distinction via the compatible string. Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20250701123851.1314531-3-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 32 +++++++++++++++++++++++++++++--- include/linux/firmware/xlnx-zynqmp.h | 5 +++++ 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 53b8c88b4178..2d250a16d3dd 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -72,6 +72,15 @@ struct pm_api_feature_data { struct hlist_node hentry; }; +struct platform_fw_data { + /* + * Family code for platform. + */ + const u32 family_code; +}; + +static struct platform_fw_data *active_platform_fw_data; + static const struct mfd_cell firmware_devs[] = { { .name = "zynqmp_power_controller", @@ -2052,6 +2061,11 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) if (ret) return ret; + /* Get platform-specific firmware data from device tree match */ + active_platform_fw_data = (struct platform_fw_data *)device_get_match_data(dev); + if (!active_platform_fw_data) + return -EINVAL; + /* Get SiP SVC version number */ ret = zynqmp_pm_get_sip_svc_version(&sip_svc_version); if (ret) @@ -2152,10 +2166,22 @@ static void zynqmp_firmware_sync_state(struct device *dev) dev_warn(dev, "failed to release power management to firmware\n"); } +static const struct platform_fw_data platform_fw_data_versal = { + .family_code = PM_VERSAL_FAMILY_CODE, +}; + +static const struct platform_fw_data platform_fw_data_versal_net = { + .family_code = PM_VERSAL_NET_FAMILY_CODE, +}; + +static const struct platform_fw_data platform_fw_data_zynqmp = { + .family_code = PM_ZYNQMP_FAMILY_CODE, +}; + static const struct of_device_id zynqmp_firmware_of_match[] = { - {.compatible = "xlnx,zynqmp-firmware"}, - {.compatible = "xlnx,versal-firmware"}, - {.compatible = "xlnx,versal-net-firmware"}, + {.compatible = "xlnx,zynqmp-firmware", .data = &platform_fw_data_zynqmp}, + {.compatible = "xlnx,versal-firmware", .data = &platform_fw_data_versal}, + {.compatible = "xlnx,versal-net-firmware", .data = &platform_fw_data_versal_net}, {}, }; MODULE_DEVICE_TABLE(of, zynqmp_firmware_of_match); diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 4699f50465f2..6458ef4e04e2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -54,6 +54,11 @@ #define ZYNQMP_FAMILY_CODE 0x23 #define VERSAL_FAMILY_CODE 0x26 +/* Family codes */ +#define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ +#define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ +#define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ + /* When all subfamily of platform need to support */ #define ALL_SUB_FAMILY_CODE 0x00 #define VERSAL_SUB_FAMILY_CODE 0x01 -- cgit v1.2.3 From 25e3ae0ce364fa725a6eea68d63d6a2ee09e019f Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Tue, 1 Jul 2025 05:38:51 -0700 Subject: drivers: firmware: xilinx: Switch to new family code in zynqmp_pm_get_family_info() Currently, the family code and subfamily code are derived from the PMC_TAP_IDCODE register. Versal, Versal NET share the same family code. Also some platforms share the same subfamily code, making it difficult to distinguish between platforms. Update zynqmp_pm_get_family_info() to use IDs derived from the compatible string instead of silicon ID codes derived from PMC_TAP_IDCODE register. Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20250701123851.1314531-4-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 42 +++++++++++++-------------------- drivers/pinctrl/pinctrl-zynqmp.c | 7 +++--- drivers/soc/xilinx/xlnx_event_manager.c | 8 +++---- drivers/soc/xilinx/zynqmp_power.c | 10 ++++---- include/linux/firmware/xlnx-zynqmp.h | 15 ++---------- 5 files changed, 31 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 2d250a16d3dd..835a50c5af46 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -473,8 +473,6 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...) static u32 pm_api_version; static u32 pm_tz_version; -static u32 pm_family_code; -static u32 pm_sub_family_code; int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset) { @@ -541,32 +539,18 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_get_chipid); /** * zynqmp_pm_get_family_info() - Get family info of platform * @family: Returned family code value - * @subfamily: Returned sub-family code value * * Return: Returns status, either success or error+reason */ -int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +int zynqmp_pm_get_family_info(u32 *family) { - u32 ret_payload[PAYLOAD_ARG_CNT]; - u32 idcode; - int ret; - - /* Check is family or sub-family code already received */ - if (pm_family_code && pm_sub_family_code) { - *family = pm_family_code; - *subfamily = pm_sub_family_code; - return 0; - } + if (!active_platform_fw_data) + return -ENODEV; - ret = zynqmp_pm_invoke_fn(PM_GET_CHIPID, ret_payload, 0); - if (ret < 0) - return ret; + if (!family) + return -EINVAL; - idcode = ret_payload[1]; - pm_family_code = FIELD_GET(FAMILY_CODE_MASK, idcode); - pm_sub_family_code = FIELD_GET(SUB_FAMILY_CODE_MASK, idcode); - *family = pm_family_code; - *subfamily = pm_sub_family_code; + *family = active_platform_fw_data->family_code; return 0; } @@ -1247,8 +1231,13 @@ int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param, u32 value) { int ret; + u32 pm_family_code; + + ret = zynqmp_pm_get_family_info(&pm_family_code); + if (ret) + return ret; - if (pm_family_code == ZYNQMP_FAMILY_CODE && + if (pm_family_code == PM_ZYNQMP_FAMILY_CODE && param == PM_PINCTRL_CONFIG_TRI_STATE) { ret = zynqmp_pm_feature(PM_PINCTRL_CONFIG_PARAM_SET); if (ret < PM_PINCTRL_PARAM_SET_VERSION) { @@ -2055,6 +2044,7 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct zynqmp_devinfo *devinfo; + u32 pm_family_code; int ret; ret = get_set_conduit_method(dev->of_node); @@ -2098,8 +2088,8 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) pr_info("%s Platform Management API v%d.%d\n", __func__, pm_api_version >> 16, pm_api_version & 0xFFFF); - /* Get the Family code and sub family code of platform */ - ret = zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code); + /* Get the Family code of platform */ + ret = zynqmp_pm_get_family_info(&pm_family_code); if (ret < 0) return ret; @@ -2126,7 +2116,7 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) zynqmp_pm_api_debugfs_init(); - if (pm_family_code == VERSAL_FAMILY_CODE) { + if (pm_family_code != PM_ZYNQMP_FAMILY_CODE) { em_dev = platform_device_register_data(&pdev->dev, "xlnx_event_manager", -1, NULL, 0); if (IS_ERR(em_dev)) diff --git a/drivers/pinctrl/pinctrl-zynqmp.c b/drivers/pinctrl/pinctrl-zynqmp.c index fddf0fef4b13..71eaac81deb1 100644 --- a/drivers/pinctrl/pinctrl-zynqmp.c +++ b/drivers/pinctrl/pinctrl-zynqmp.c @@ -100,7 +100,6 @@ struct zynqmp_pctrl_group { static struct pinctrl_desc zynqmp_desc; static u32 family_code; -static u32 sub_family_code; static int zynqmp_pctrl_get_groups_count(struct pinctrl_dev *pctldev) { @@ -605,7 +604,7 @@ static int zynqmp_pinctrl_prepare_func_groups(struct device *dev, u32 fid, return -ENOMEM; for (pin = 0; pin < groups[resp[i]].npins; pin++) { - if (family_code == ZYNQMP_FAMILY_CODE) + if (family_code == PM_ZYNQMP_FAMILY_CODE) __set_bit(groups[resp[i]].pins[pin], used_pins); else __set_bit((u8)groups[resp[i]].pins[pin] - 1, used_pins); @@ -958,11 +957,11 @@ static int zynqmp_pinctrl_probe(struct platform_device *pdev) if (!pctrl) return -ENOMEM; - ret = zynqmp_pm_get_family_info(&family_code, &sub_family_code); + ret = zynqmp_pm_get_family_info(&family_code); if (ret < 0) return ret; - if (family_code == ZYNQMP_FAMILY_CODE) { + if (family_code == PM_ZYNQMP_FAMILY_CODE) { ret = zynqmp_pinctrl_prepare_pin_desc(&pdev->dev, &zynqmp_desc.pins, &zynqmp_desc.npins); } else { diff --git a/drivers/soc/xilinx/xlnx_event_manager.c b/drivers/soc/xilinx/xlnx_event_manager.c index a572d15f6161..6fdf4d14b7e7 100644 --- a/drivers/soc/xilinx/xlnx_event_manager.c +++ b/drivers/soc/xilinx/xlnx_event_manager.c @@ -77,17 +77,17 @@ struct registered_event_data { static bool xlnx_is_error_event(const u32 node_id) { - u32 pm_family_code, pm_sub_family_code; + u32 pm_family_code; - zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code); + zynqmp_pm_get_family_info(&pm_family_code); - if (pm_sub_family_code == VERSAL_SUB_FAMILY_CODE) { + if (pm_family_code == PM_VERSAL_FAMILY_CODE) { if (node_id == VERSAL_EVENT_ERROR_PMC_ERR1 || node_id == VERSAL_EVENT_ERROR_PMC_ERR2 || node_id == VERSAL_EVENT_ERROR_PSM_ERR1 || node_id == VERSAL_EVENT_ERROR_PSM_ERR2) return true; - } else { + } else if (pm_family_code == PM_VERSAL_NET_FAMILY_CODE) { if (node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR1 || node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR2 || node_id == VERSAL_NET_EVENT_ERROR_PMC_ERR3 || diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c index ae59bf16659a..9b7b2858b22a 100644 --- a/drivers/soc/xilinx/zynqmp_power.c +++ b/drivers/soc/xilinx/zynqmp_power.c @@ -285,7 +285,7 @@ static int register_event(struct device *dev, const enum pm_api_cb_id cb_type, c static int zynqmp_pm_probe(struct platform_device *pdev) { int ret, irq; - u32 pm_api_version, pm_family_code, pm_sub_family_code, node_id; + u32 pm_api_version, pm_family_code, node_id; struct mbox_client *client; ret = zynqmp_pm_get_api_version(&pm_api_version); @@ -315,14 +315,16 @@ static int zynqmp_pm_probe(struct platform_device *pdev) INIT_WORK(&zynqmp_pm_init_suspend_work->callback_work, zynqmp_pm_init_suspend_work_fn); - ret = zynqmp_pm_get_family_info(&pm_family_code, &pm_sub_family_code); + ret = zynqmp_pm_get_family_info(&pm_family_code); if (ret < 0) return ret; - if (pm_sub_family_code == VERSALNET_SUB_FAMILY_CODE) + if (pm_family_code == PM_VERSAL_NET_FAMILY_CODE) node_id = PM_DEV_ACPU_0_0; - else + else if (pm_family_code == PM_VERSAL_FAMILY_CODE) node_id = PM_DEV_ACPU_0; + else + return -ENODEV; ret = register_event(&pdev->dev, PM_NOTIFY_CB, node_id, EVENT_SUBSYSTEM_RESTART, false, subsystem_restart_event_callback); diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 6458ef4e04e2..be6817ac5120 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -51,22 +51,11 @@ #define PM_PINCTRL_PARAM_SET_VERSION 2 -#define ZYNQMP_FAMILY_CODE 0x23 -#define VERSAL_FAMILY_CODE 0x26 - /* Family codes */ #define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ #define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ #define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ -/* When all subfamily of platform need to support */ -#define ALL_SUB_FAMILY_CODE 0x00 -#define VERSAL_SUB_FAMILY_CODE 0x01 -#define VERSALNET_SUB_FAMILY_CODE 0x03 - -#define FAMILY_CODE_MASK GENMASK(27, 21) -#define SUB_FAMILY_CODE_MASK GENMASK(20, 19) - #define API_ID_MASK GENMASK(7, 0) #define MODULE_ID_MASK GENMASK(11, 8) #define PLM_MODULE_ID_MASK GENMASK(15, 8) @@ -570,7 +559,7 @@ int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); -int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily); +int zynqmp_pm_get_family_info(u32 *family); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); @@ -651,7 +640,7 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) return -ENODEV; } -static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +static inline int zynqmp_pm_get_family_info(u32 *family) { return -ENODEV; } -- cgit v1.2.3 From ed4a5c5de56ad4e23c9e5da8981639352b63b8ac Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Tue, 23 Sep 2025 18:16:07 +0000 Subject: usb: typec: class: add typec_get_data_role symbol Alt Mode drivers are responsible for sending Enter Mode through the TCPM, but only a DFP is allowed to send Enter Mode. typec_get_data_role gets the port's data role, which can then be used in altmode drivers via typec_altmode_get_data_role to know if Enter Mode should be sent. Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250923181606.1583584-5-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 13 +++++++++++++ include/linux/usb/typec.h | 1 + include/linux/usb/typec_altmode.h | 13 +++++++++++++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 67a533e35150..9b2647cb199b 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -2120,6 +2120,19 @@ void typec_set_data_role(struct typec_port *port, enum typec_data_role role) } EXPORT_SYMBOL_GPL(typec_set_data_role); +/** + * typec_get_data_role - Get port data role + * @port: The USB Type-C Port to query + * + * This routine is used by the altmode drivers to determine if the port is the + * DFP before issuing Enter Mode + */ +enum typec_data_role typec_get_data_role(struct typec_port *port) +{ + return port->data_role; +} +EXPORT_SYMBOL_GPL(typec_get_data_role); + /** * typec_set_pwr_role - Report power role change * @port: The USB Type-C Port where the role was changed diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 252af3f77039..309251572e2e 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -337,6 +337,7 @@ struct typec_plug *typec_register_plug(struct typec_cable *cable, void typec_unregister_plug(struct typec_plug *plug); void typec_set_data_role(struct typec_port *port, enum typec_data_role role); +enum typec_data_role typec_get_data_role(struct typec_port *port); void typec_set_pwr_role(struct typec_port *port, enum typec_role role); void typec_set_vconn_role(struct typec_port *port, enum typec_role role); void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode); diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index b3c0866ea70f..f7db3bd4c90e 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -172,6 +172,19 @@ typec_altmode_get_svdm_version(struct typec_altmode *altmode) return typec_get_negotiated_svdm_version(typec_altmode2port(altmode)); } +/** + * typec_altmode_get_data_role - Get port data role + * @altmode: Handle to the alternate mode + * + * Alt Mode drivers should only issue Enter Mode through the port if they are + * the DFP. + */ +static inline enum typec_data_role +typec_altmode_get_data_role(struct typec_altmode *altmode) +{ + return typec_get_data_role(typec_altmode2port(altmode)); +} + /** * struct typec_altmode_driver - USB Type-C alternate mode device driver * @id_table: Null terminated array of SVIDs -- cgit v1.2.3 From 536bf30d282a6b2f676c6106587f0e1946449aca Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:53 -0500 Subject: iio: buffer: document iio_push_to_buffers_with_ts() Document the iio_push_to_buffers_with_ts() function. This is copied and slightly cleaned up from iio_push_to_buffers_with_timestamp(). Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 5c84ec4a9810..e46b818981aa 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -45,6 +45,22 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, return iio_push_to_buffers(indio_dev, data); } +/** + * iio_push_to_buffers_with_ts() - push data and timestamp to buffers + * @indio_dev: iio_dev structure for device. + * @data: Pointer to sample data buffer. + * @data_total_len: The size of @data in bytes. + * @timestamp: Timestamp for the sample data. + * + * Pushes data to the IIO device's buffers. If timestamps are enabled for the + * device the function will store the supplied timestamp as the last element in + * the sample data buffer before pushing it to the device buffers. The sample + * data buffer needs to be large enough to hold the additional timestamp + * (usually the buffer should be at least indio->scan_bytes bytes large). + * + * Context: Any context. + * Return: 0 on success, a negative error code otherwise. + */ static inline int iio_push_to_buffers_with_ts(struct iio_dev *indio_dev, void *data, size_t data_total_len, s64 timestamp) -- cgit v1.2.3 From 4992ce003b76ee1629ad4e7332a49ea2619e7523 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:54 -0500 Subject: iio: buffer: deprecated iio_push_to_buffers_with_timestamp() Replace the documentation of iio_push_to_buffers_with_timestamp() with a deprecation notice pointing to the preferred alternative. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index e46b818981aa..d37f82678f71 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -26,11 +26,7 @@ int iio_pop_from_buffer(struct iio_buffer *buffer, void *data); * @data: sample data * @timestamp: timestamp for the sample data * - * Pushes data to the IIO device's buffers. If timestamps are enabled for the - * device the function will store the supplied timestamp as the last element in - * the sample data buffer before pushing it to the device buffers. The sample - * data buffer needs to be large enough to hold the additional timestamp - * (usually the buffer should be indio->scan_bytes bytes large). + * DEPRECATED: Use iio_push_to_buffers_with_ts() instead. * * Returns 0 on success, a negative error code otherwise. */ -- cgit v1.2.3 From 748ed9fc8596015e7e136877465919b89c7d08d6 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:56 -0500 Subject: iio: buffer: document store_to() callback may be called in any context Document that the struct iio_buffer_access_funcs.store_to() callback must be safe to call from any context since it is called from iio_push_to_buffer() which may be called from any context. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..0daff9ff20ce 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -24,7 +24,8 @@ struct sg_table; /** * struct iio_buffer_access_funcs - access functions for buffers. - * @store_to: actually store stuff to the buffer + * @store_to: actually store stuff to the buffer - must be safe to + * call from any context (e.g. must not sleep). * @read: try to get a specified number of bytes (must exist) * @data_available: indicates how much data is available for reading from * the buffer. -- cgit v1.2.3 From 592ae0ccecfac9af8f67444cab11cbb11770f571 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:57 -0500 Subject: iio: buffer: document that buffer callback must be context safe Document that the callback registered with iio_channel_get_all_cb() must be safe to call from any context since it is called from by iio_push_to_buffer() which can be called in any context. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- drivers/iio/buffer/industrialio-buffer-cb.c | 1 + include/linux/iio/consumer.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iio/buffer/industrialio-buffer-cb.c b/drivers/iio/buffer/industrialio-buffer-cb.c index 3e27385069ed..f4ebff968493 100644 --- a/drivers/iio/buffer/industrialio-buffer-cb.c +++ b/drivers/iio/buffer/industrialio-buffer-cb.c @@ -13,6 +13,7 @@ struct iio_cb_buffer { struct iio_buffer buffer; + /* Must be safe to call from any context (e.g. must not sleep). */ int (*cb)(const void *data, void *private); void *private; struct iio_channel *channels; diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index a38b277c2c02..5039558267e4 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -131,7 +131,8 @@ struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture * @dev: Pointer to client device. - * @cb: Callback function. + * @cb: Callback function. Must be safe to call from any context + * (e.g. must not sleep). * @private: Private data passed to callback. * * NB right now we have no ability to mux data from multiple devices. -- cgit v1.2.3 From 8b9cd112f1ac8d72244b189654e693012ea8dfe0 Mon Sep 17 00:00:00 2001 From: André Draszik Date: Thu, 9 Oct 2025 10:31:27 +0100 Subject: soc: samsung: gs101-pmu: implement access tables for read and write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accessing non-existent PMU registers causes an SError, halting the system. Implement read and write access tables for the gs101-PMU to specify which registers are read- and/or writable to avoid that SError. Reviewed-by: Sam Protsenko Signed-off-by: André Draszik Link: https://patch.msgid.link/20251009-gs101-pmu-regmap-tables-v2-3-2d64f5261952@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/soc/samsung/gs101-pmu.c | 306 ++++++++++++++++++++++++- include/linux/soc/samsung/exynos-regs-pmu.h | 343 +++++++++++++++++++++++++++- 2 files changed, 640 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/samsung/gs101-pmu.c b/drivers/soc/samsung/gs101-pmu.c index ec345d09f21f..17dadc1b9c6e 100644 --- a/drivers/soc/samsung/gs101-pmu.c +++ b/drivers/soc/samsung/gs101-pmu.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "exynos-pmu.h" @@ -20,9 +21,312 @@ #define TENSOR_PMUREG_WRITE 1 #define TENSOR_PMUREG_RMW 2 +static const struct regmap_range gs101_pmu_registers[] = { + regmap_reg_range(GS101_OM_STAT, GS101_SYSTEM_INFO), + regmap_reg_range(GS101_IDLE_IP(0), GS101_IDLE_IP_MASK(3)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(0), + GS101_PPMPURAM_INFORM_SCL_CH(3)), + regmap_reg_range(GS101_INFORM0, GS101_SYSIP_DAT(0)), + /* skip SYSIP_DAT1 SYSIP_DAT2 */ + regmap_reg_range(GS101_SYSIP_DAT(3), GS101_PWR_HOLD_SW_TRIP), + regmap_reg_range(GS101_GSA_INFORM(0), GS101_GSA_INFORM(1)), + regmap_reg_range(GS101_INFORM4, GS101_IROM_INFORM), + regmap_reg_range(GS101_IROM_CPU_INFORM(0), GS101_IROM_CPU_INFORM(7)), + regmap_reg_range(GS101_PMU_SPARE(0), GS101_PMU_SPARE(3)), + /* skip most IROM_xxx registers */ + regmap_reg_range(GS101_DREX_CALIBRATION(0), GS101_DREX_CALIBRATION(7)), + +#define CLUSTER_CPU_RANGE(cl, cpu) \ + regmap_reg_range(GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu), \ + GS101_CLUSTER_CPU_OPTION(cl, cpu)), \ + regmap_reg_range(GS101_CLUSTER_CPU_OUT(cl, cpu), \ + GS101_CLUSTER_CPU_IN(cl, cpu)), \ + regmap_reg_range(GS101_CLUSTER_CPU_INT_IN(cl, cpu), \ + GS101_CLUSTER_CPU_INT_DIR(cl, cpu)) + + /* cluster 0..2 and cpu 0..4 or 0..1 */ + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 2), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 3), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 1), +#undef CLUSTER_CPU_RANGE + +#define CLUSTER_NONCPU_RANGE(cl) \ + regmap_reg_range(GS101_CLUSTER_NONCPU_CONFIGURATION(cl), \ + GS101_CLUSTER_NONCPU_OPTION(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_OUT(cl), \ + GS101_CLUSTER_NONCPU_IN(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_INT_IN(cl), \ + GS101_CLUSTER_NONCPU_INT_DIR(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl), \ + GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl), \ + GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl)) + + CLUSTER_NONCPU_RANGE(0), + regmap_reg_range(GS101_CLUSTER0_NONCPU_DSU_PCH, + GS101_CLUSTER0_NONCPU_DSU_PCH), + CLUSTER_NONCPU_RANGE(1), + CLUSTER_NONCPU_RANGE(2), +#undef CLUSTER_NONCPU_RANGE + +#define SUBBLK_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_CONFIGURATION(blk), \ + GS101_SUBBLK_CTRL(blk)), \ + regmap_reg_range(GS101_SUBBLK_OUT(blk), GS101_SUBBLK_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_INT_IN(blk), \ + GS101_SUBBLK_INT_DIR(blk)), \ + regmap_reg_range(GS101_SUBBLK_MEMORY_OUT(blk), \ + GS101_SUBBLK_MEMORY_IN(blk)) + + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ALIVE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_AOC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_APM), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CMU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CORE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EH), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DISP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G2D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MFC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CSIS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PDP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DNS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3AA), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_IPP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ITP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MCSC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_GDC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TNR), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BO), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF3), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MISC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_S2D), +#undef SUBBLK_RANGE + +#define SUBBLK_CPU_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_CPU_CONFIGURATION(blk), \ + GS101_SUBBLK_CPU_OPTION(blk)), \ + regmap_reg_range(GS101_SUBBLK_CPU_OUT(blk), \ + GS101_SUBBLK_CPU_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_CPU_INT_IN(blk), \ + GS101_SUBBLK_CPU_INT_DIR(blk)) + + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_APM), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_DBGCORE), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_SSS), +#undef SUBBLK_CPU_RANGE + + regmap_reg_range(GS101_MIF_CONFIGURATION, GS101_MIF_CTRL), + regmap_reg_range(GS101_MIF_OUT, GS101_MIF_IN), + regmap_reg_range(GS101_MIF_INT_IN, GS101_MIF_INT_DIR), + regmap_reg_range(GS101_TOP_CONFIGURATION, GS101_TOP_OPTION), + regmap_reg_range(GS101_TOP_OUT, GS101_TOP_IN), + regmap_reg_range(GS101_TOP_INT_IN, GS101_WAKEUP2_STAT), + regmap_reg_range(GS101_WAKEUP2_INT_IN, GS101_WAKEUP2_INT_DIR), + regmap_reg_range(GS101_SYSTEM_CONFIGURATION, GS101_USER_DEFINED_OUT), + regmap_reg_range(GS101_SYSTEM_OUT, GS101_SYSTEM_IN), + regmap_reg_range(GS101_SYSTEM_INT_IN, GS101_EINT_WAKEUP_MASK3), + regmap_reg_range(GS101_USER_DEFINED_INT_IN, GS101_SCAN2DRAM_INT_DIR), + /* skip HCU_START */ + regmap_reg_range(GS101_CUSTOM_OUT, GS101_CUSTOM_IN), + regmap_reg_range(GS101_CUSTOM_INT_IN, GS101_CUSTOM_INT_DIR), + regmap_reg_range(GS101_ACK_LAST_CPU, GS101_HCU_R(3)), + regmap_reg_range(GS101_HCU_SP, GS101_HCU_PC), + /* skip PMU_RAM_CTRL */ + regmap_reg_range(GS101_APM_HCU_CTRL, GS101_APM_HCU_CTRL), + regmap_reg_range(GS101_APM_NMI_ENABLE, GS101_RST_STAT_PMU), + regmap_reg_range(GS101_HPM_INT_IN, GS101_BOOT_STAT), + regmap_reg_range(GS101_PMLINK_OUT, GS101_PMLINK_AOC_CTRL), + regmap_reg_range(GS101_TCXO_BUF_CTRL, GS101_ADD_CTRL), + regmap_reg_range(GS101_HCU_TIMEOUT_RESET, GS101_HCU_TIMEOUT_SCAN2DRAM), + regmap_reg_range(GS101_TIMER(0), GS101_TIMER(3)), + regmap_reg_range(GS101_PPC_MIF(0), GS101_PPC_EH), + /* PPC_OFFSET, skip PPC_CPUCL1_0 PPC_CPUCL1_1 */ + regmap_reg_range(GS101_EXT_REGULATOR_MIF_DURATION, GS101_TCXO_DURATION), + regmap_reg_range(GS101_BURNIN_CTRL, GS101_TMU_SUB_TRIP), + regmap_reg_range(GS101_MEMORY_CEN, GS101_MEMORY_SMX_FEEDBACK), + regmap_reg_range(GS101_SLC_PCH_CHANNEL, GS101_SLC_PCH_CB), + regmap_reg_range(GS101_FORCE_NOMC, GS101_FORCE_NOMC), + regmap_reg_range(GS101_FORCE_BOOST, GS101_PMLINK_SLC_BUSY), + regmap_reg_range(GS101_BOOTSYNC_OUT, GS101_CTRL_SECJTAG_ALIVE), + regmap_reg_range(GS101_CTRL_DIV_PLL_ALV_DIVLOW, GS101_CTRL_CLKDIV__CLKRTC), + regmap_reg_range(GS101_CTRL_SOC32K, GS101_CTRL_SBU_SW_EN), + regmap_reg_range(GS101_PAD_CTRL_CLKOUT0, GS101_PAD_CTRL_WRESETO_n), + regmap_reg_range(GS101_PHY_CTRL_USB20, GS101_PHY_CTRL_UFS), +}; + +static const struct regmap_range gs101_pmu_ro_registers[] = { + regmap_reg_range(GS101_OM_STAT, GS101_VERSION), + regmap_reg_range(GS101_OTP_STATUS, GS101_OTP_STATUS), + + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(0), + GS101_PPMPURAM_STATE_SLC_CH(0)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(1), + GS101_PPMPURAM_STATE_SLC_CH(1)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(2), + GS101_PPMPURAM_STATE_SLC_CH(2)), + regmap_reg_range(GS101_DATARAM_STATE_SLC_CH(3), + GS101_PPMPURAM_STATE_SLC_CH(3)), + +#define CLUSTER_CPU_RANGE(cl, cpu) \ + regmap_reg_range(GS101_CLUSTER_CPU_IN(cl, cpu), \ + GS101_CLUSTER_CPU_IN(cl, cpu)), \ + regmap_reg_range(GS101_CLUSTER_CPU_INT_IN(cl, cpu), \ + GS101_CLUSTER_CPU_INT_IN(cl, cpu)) + + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 2), + CLUSTER_CPU_RANGE(GS101_CLUSTER0_OFFSET, 3), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER1_OFFSET, 1), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 0), + CLUSTER_CPU_RANGE(GS101_CLUSTER2_OFFSET, 1), +#undef CLUSTER_CPU_RANGE + +#define CLUSTER_NONCPU_RANGE(cl) \ + regmap_reg_range(GS101_CLUSTER_NONCPU_IN(cl), \ + GS101_CLUSTER_NONCPU_IN(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_INT_IN(cl), \ + GS101_CLUSTER_NONCPU_INT_IN(cl)), \ + regmap_reg_range(GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl), \ + GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl)) + + CLUSTER_NONCPU_RANGE(0), + CLUSTER_NONCPU_RANGE(1), + CLUSTER_NONCPU_RANGE(2), + regmap_reg_range(GS101_CLUSTER_NONCPU_INT_EN(2), + GS101_CLUSTER_NONCPU_INT_DIR(2)), +#undef CLUSTER_NONCPU_RANGE + +#define SUBBLK_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_IN(blk), GS101_SUBBLK_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_INT_IN(blk), \ + GS101_SUBBLK_INT_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_MEMORY_IN(blk), \ + GS101_SUBBLK_MEMORY_IN(blk)) + + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ALIVE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_AOC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_APM), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CMU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BUS2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CORE), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EH), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CPUCL2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_EMBEDDED_G3D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_HSI2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DISP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G2D), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MFC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_CSIS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PDP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_DNS), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_G3AA), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_IPP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_ITP), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MCSC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_GDC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TNR), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_BO), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_TPU), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF2), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MIF3), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_MISC), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC0), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_PERIC1), + SUBBLK_RANGE(GS101_SUBBBLK_OFFSET_S2D), +#undef SUBBLK_RANGE + +#define SUBBLK_CPU_RANGE(blk) \ + regmap_reg_range(GS101_SUBBLK_CPU_IN(blk), \ + GS101_SUBBLK_CPU_IN(blk)), \ + regmap_reg_range(GS101_SUBBLK_CPU_INT_IN(blk), \ + GS101_SUBBLK_CPU_INT_IN(blk)) + + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_APM), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_DBGCORE), + SUBBLK_CPU_RANGE(GS101_SUBBBLK_CPU_OFFSET_SSS), +#undef SUBBLK_CPU_RANGE + + regmap_reg_range(GS101_MIF_CONFIGURATION, GS101_MIF_CONFIGURATION), + regmap_reg_range(GS101_MIF_IN, GS101_MIF_IN), + regmap_reg_range(GS101_MIF_INT_IN, GS101_MIF_INT_IN), + regmap_reg_range(GS101_TOP_IN, GS101_TOP_IN), + regmap_reg_range(GS101_TOP_INT_IN, GS101_TOP_INT_IN), + regmap_reg_range(GS101_WAKEUP2_INT_IN, GS101_WAKEUP2_INT_IN), + regmap_reg_range(GS101_SYSTEM_IN, GS101_SYSTEM_IN), + regmap_reg_range(GS101_SYSTEM_INT_IN, GS101_SYSTEM_INT_IN), + regmap_reg_range(GS101_EINT_INT_IN, GS101_EINT_INT_IN), + regmap_reg_range(GS101_EINT2_INT_IN, GS101_EINT2_INT_IN), + regmap_reg_range(GS101_EINT3_INT_IN, GS101_EINT3_INT_IN), + regmap_reg_range(GS101_USER_DEFINED_INT_IN, GS101_USER_DEFINED_INT_IN), + regmap_reg_range(GS101_SCAN2DRAM_INT_IN, GS101_SCAN2DRAM_INT_IN), + regmap_reg_range(GS101_CUSTOM_IN, GS101_CUSTOM_IN), + regmap_reg_range(GS101_CUSTOM_INT_IN, GS101_CUSTOM_INT_IN), + regmap_reg_range(GS101_HCU_R(0), GS101_HCU_R(3)), + regmap_reg_range(GS101_HCU_SP, GS101_HCU_PC), + regmap_reg_range(GS101_NMI_SRC_IN, GS101_NMI_SRC_IN), + regmap_reg_range(GS101_HPM_INT_IN, GS101_HPM_INT_IN), + regmap_reg_range(GS101_MEMORY_PGEN_FEEDBACK, GS101_MEMORY_PGEN_FEEDBACK), + regmap_reg_range(GS101_MEMORY_SMX_FEEDBACK, GS101_MEMORY_SMX_FEEDBACK), + regmap_reg_range(GS101_PMLINK_SLC_ACK, GS101_PMLINK_SLC_BUSY), + regmap_reg_range(GS101_BOOTSYNC_IN, GS101_BOOTSYNC_IN), + regmap_reg_range(GS101_SCAN_READY_IN, GS101_SCAN_READY_IN), + regmap_reg_range(GS101_CTRL_PLL_ALV_LOCK, GS101_CTRL_PLL_ALV_LOCK), +}; + +static const struct regmap_access_table gs101_pmu_rd_table = { + .yes_ranges = gs101_pmu_registers, + .n_yes_ranges = ARRAY_SIZE(gs101_pmu_registers), +}; + +static const struct regmap_access_table gs101_pmu_wr_table = { + .yes_ranges = gs101_pmu_registers, + .n_yes_ranges = ARRAY_SIZE(gs101_pmu_registers), + .no_ranges = gs101_pmu_ro_registers, + .n_no_ranges = ARRAY_SIZE(gs101_pmu_ro_registers), +}; + const struct exynos_pmu_data gs101_pmu_data = { .pmu_secure = true, .pmu_cpuhp = true, + .rd_table = &gs101_pmu_rd_table, + .wr_table = &gs101_pmu_wr_table, }; /* @@ -124,7 +428,7 @@ static bool tensor_is_atomic(unsigned int reg) return false; switch (reg) { - case GS101_SYSIP_DAT0: + case GS101_SYSIP_DAT(0): case GS101_SYSTEM_CONFIGURATION: return false; default: diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 71e0c09a49eb..532c6c2d1195 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -672,14 +672,341 @@ /* For Tensor GS101 */ /* PMU ALIVE */ -#define GS101_SYSIP_DAT0 (0x810) -#define GS101_CPU0_INFORM (0x860) -#define GS101_CPU_INFORM(cpu) \ - (GS101_CPU0_INFORM + (cpu*4)) -#define GS101_SYSTEM_CONFIGURATION (0x3A00) -#define GS101_EINT_WAKEUP_MASK (0x3A80) -#define GS101_PHY_CTRL_USB20 (0x3EB0) -#define GS101_PHY_CTRL_USBDP (0x3EB4) +#define GS101_OM_STAT 0x0000 +#define GS101_VERSION 0x0004 +#define GS101_PORESET_CHECK 0x0008 +#define GS101_OTP_STATUS 0x000c +#define GS101_SYSTEM_INFO 0x0010 +#define GS101_IDLE_IP(n) (0x03e0 + ((n) & 3) * 4) +#define GS101_IDLE_IP_MASK(n) (0x03f0 + ((n) & 3) * 4) +#define GS101_SLC_CH_OFFSET(ch) (0x0400 + ((ch) & 3) * 0x10) +#define GS101_DATARAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x00) +#define GS101_TAGRAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x04) +#define GS101_LRURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x08) +#define GS101_PPMPURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x0c) +#define GS101_DATARAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x40) +#define GS101_TAGRAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x44) +#define GS101_LRURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x48) +#define GS101_PPMPURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x4c) +#define GS101_INFORM0 0x0800 +#define GS101_INFORM1 0x0804 +#define GS101_INFORM2 0x0808 +#define GS101_INFORM3 0x080c +#define GS101_SYSIP_DAT(n) (0x0810 + ((n) & 3) * 4) +#define GS101_PWR_HOLD_HW_TRIP 0x0820 +#define GS101_PWR_HOLD_SW_TRIP 0x0824 +#define GS101_GSA_INFORM(n) (0x0830 + ((n) & 1) * 4) +#define GS101_INFORM4 0x0840 +#define GS101_INFORM5 0x0844 +#define GS101_INFORM6 0x0848 +#define GS101_INFORM7 0x084c +#define GS101_INFORM8 0x0850 +#define GS101_INFORM9 0x0854 +#define GS101_INFORM10 0x0858 +#define GS101_INFORM11 0x085c +#define GS101_CPU_INFORM(cpu) (0x0860 + ((cpu) & 7) * 4) +#define GS101_IROM_INFORM 0x0880 +#define GS101_IROM_CPU_INFORM(cpu) (0x0890 + ((cpu) & 7) * 4) +#define GS101_PMU_SPARE(n) (0x0900 + ((n) & 3) * 4) +#define GS101_IROM_DATA_REG(n) (0x0980 + ((n) & 3) * 4) +#define GS101_IROM_PWRMODE 0x0990 +#define GS101_DREX_CALIBRATION(n) (0x09a0 + ((n) & 7) * 4) + +#define GS101_CLUSTER0_OFFSET 0x1000 +#define GS101_CLUSTER1_OFFSET 0x1300 +#define GS101_CLUSTER2_OFFSET 0x1500 +#define GS101_CLUSTER_CPU_OFFSET(cl, cpu) ((cl) + ((cpu) * 0x80)) +#define GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x00) +#define GS101_CLUSTER_CPU_STATUS(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x04) +#define GS101_CLUSTER_CPU_STATES(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x08) +#define GS101_CLUSTER_CPU_OPTION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x0c) +#define GS101_CLUSTER_CPU_OUT(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x20) +#define GS101_CLUSTER_CPU_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x24) +#define GS101_CLUSTER_CPU_INT_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x40) +#define GS101_CLUSTER_CPU_INT_EN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x44) +#define GS101_CLUSTER_CPU_INT_TYPE(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x48) +#define GS101_CLUSTER_CPU_INT_DIR(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x4c) + +#define GS101_CLUSTER_NONCPU_OFFSET(cl) (0x1200 + ((cl) * 0x200)) +#define GS101_CLUSTER_NONCPU_CONFIGURATION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x00) +#define GS101_CLUSTER_NONCPU_STATUS(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x04) +#define GS101_CLUSTER_NONCPU_STATES(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x08) +#define GS101_CLUSTER_NONCPU_OPTION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x0c) +#define GS101_CLUSTER_NONCPU_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x20) +#define GS101_CLUSTER_NONCPU_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x24) +#define GS101_CLUSTER_NONCPU_INT_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x40) +#define GS101_CLUSTER_NONCPU_INT_EN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x44) +#define GS101_CLUSTER_NONCPU_INT_TYPE(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x48) +#define GS101_CLUSTER_NONCPU_INT_DIR(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x4c) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x60) +#define GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x64) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x6c) +#define GS101_CLUSTER0_NONCPU_DSU_PCH \ + (GS101_CLUSTER_NONCPU_OFFSET(0) + 0x80) + +#define GS101_SUBBBLK_OFFSET_ALIVE 0x1800 +#define GS101_SUBBBLK_OFFSET_AOC 0x1880 +#define GS101_SUBBBLK_OFFSET_APM 0x1900 +#define GS101_SUBBBLK_OFFSET_CMU 0x1980 +#define GS101_SUBBBLK_OFFSET_BUS0 0x1a00 +#define GS101_SUBBBLK_OFFSET_BUS1 0x1a80 +#define GS101_SUBBBLK_OFFSET_BUS2 0x1b00 +#define GS101_SUBBBLK_OFFSET_CORE 0x1b80 +#define GS101_SUBBBLK_OFFSET_EH 0x1c00 +#define GS101_SUBBBLK_OFFSET_CPUCL0 0x1c80 +#define GS101_SUBBBLK_OFFSET_CPUCL1 0x1d00 +#define GS101_SUBBBLK_OFFSET_CPUCL2 0x1d80 +#define GS101_SUBBBLK_OFFSET_G3D 0x1e00 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0 0x1e80 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_G3D 0x2000 +#define GS101_SUBBBLK_OFFSET_HSI0 0x2080 +#define GS101_SUBBBLK_OFFSET_HSI1 0x2100 +#define GS101_SUBBBLK_OFFSET_HSI2 0x2180 +#define GS101_SUBBBLK_OFFSET_DPU 0x2200 +#define GS101_SUBBBLK_OFFSET_DISP 0x2280 +#define GS101_SUBBBLK_OFFSET_G2D 0x2300 +#define GS101_SUBBBLK_OFFSET_MFC 0x2380 +#define GS101_SUBBBLK_OFFSET_CSIS 0x2400 +#define GS101_SUBBBLK_OFFSET_PDP 0x2480 +#define GS101_SUBBBLK_OFFSET_DNS 0x2500 +#define GS101_SUBBBLK_OFFSET_G3AA 0x2580 +#define GS101_SUBBBLK_OFFSET_IPP 0x2600 +#define GS101_SUBBBLK_OFFSET_ITP 0x2680 +#define GS101_SUBBBLK_OFFSET_MCSC 0x2700 +#define GS101_SUBBBLK_OFFSET_GDC 0x2780 +#define GS101_SUBBBLK_OFFSET_TNR 0x2800 +#define GS101_SUBBBLK_OFFSET_BO 0x2880 +#define GS101_SUBBBLK_OFFSET_TPU 0x2900 +#define GS101_SUBBBLK_OFFSET_MIF0 0x2980 +#define GS101_SUBBBLK_OFFSET_MIF1 0x2a00 +#define GS101_SUBBBLK_OFFSET_MIF2 0x2a80 +#define GS101_SUBBBLK_OFFSET_MIF3 0x2b00 +#define GS101_SUBBBLK_OFFSET_MISC 0x2b80 +#define GS101_SUBBBLK_OFFSET_PERIC0 0x2c00 +#define GS101_SUBBBLK_OFFSET_PERIC1 0x2c80 +#define GS101_SUBBBLK_OFFSET_S2D 0x2d00 +#define GS101_SUBBLK_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CTRL(blk) ((blk) + 0x10) +#define GS101_SUBBLK_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_INT_DIR(blk) ((blk) + 0x4c) +#define GS101_SUBBLK_MEMORY_OUT(blk) ((blk) + 0x60) +#define GS101_SUBBLK_MEMORY_IN(blk) ((blk) + 0x64) + +#define GS101_SUBBBLK_CPU_OFFSET_APM 0x3000 +#define GS101_SUBBBLK_CPU_OFFSET_DBGCORE 0x3080 +#define GS101_SUBBBLK_CPU_OFFSET_SSS 0x3100 +#define GS101_SUBBLK_CPU_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_CPU_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_CPU_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_CPU_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CPU_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_CPU_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_CPU_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_CPU_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_CPU_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_CPU_INT_DIR(blk) ((blk) + 0x4c) + +#define GS101_MIF_CONFIGURATION 0x3800 +#define GS101_MIF_STATUS 0x3804 +#define GS101_MIF_STATES 0x3808 +#define GS101_MIF_OPTION 0x380c +#define GS101_MIF_CTRL 0x3810 +#define GS101_MIF_OUT 0x3820 +#define GS101_MIF_IN 0x3824 +#define GS101_MIF_INT_IN 0x3840 +#define GS101_MIF_INT_EN 0x3844 +#define GS101_MIF_INT_TYPE 0x3848 +#define GS101_MIF_INT_DIR 0x384c +#define GS101_TOP_CONFIGURATION 0x3900 +#define GS101_TOP_STATUS 0x3904 +#define GS101_TOP_STATES 0x3908 +#define GS101_TOP_OPTION 0x390c +#define GS101_TOP_OUT 0x3920 +#define GS101_TOP_IN 0x3924 +#define GS101_TOP_INT_IN 0x3940 +#define GS101_TOP_INT_EN 0x3944 +#define GS101_TOP_INT_TYPE 0x3948 +#define GS101_TOP_INT_DIR 0x394c +#define GS101_WAKEUP_STAT 0x3950 +#define GS101_WAKEUP2_STAT 0x3954 +#define GS101_WAKEUP2_INT_IN 0x3960 +#define GS101_WAKEUP2_INT_EN 0x3964 +#define GS101_WAKEUP2_INT_TYPE 0x3968 +#define GS101_WAKEUP2_INT_DIR 0x396c +#define GS101_SYSTEM_CONFIGURATION 0x3a00 +#define GS101_SYSTEM_STATUS 0x3a04 +#define GS101_SYSTEM_STATES 0x3a08 +#define GS101_SYSTEM_OPTION 0x3a0c +#define GS101_SYSTEM_CTRL 0x3a10 +#define GS101_SPARE_CTRL 0x3a14 +#define GS101_USER_DEFINED_OUT 0x3a18 +#define GS101_SYSTEM_OUT 0x3a20 +#define GS101_SYSTEM_IN 0x3a24 +#define GS101_SYSTEM_INT_IN 0x3a40 +#define GS101_SYSTEM_INT_EN 0x3a44 +#define GS101_SYSTEM_INT_TYPE 0x3a48 +#define GS101_SYSTEM_INT_DIR 0x3a4c +#define GS101_EINT_INT_IN 0x3a50 +#define GS101_EINT_INT_EN 0x3a54 +#define GS101_EINT_INT_TYPE 0x3a58 +#define GS101_EINT_INT_DIR 0x3a5c +#define GS101_EINT2_INT_IN 0x3a60 +#define GS101_EINT2_INT_EN 0x3a64 +#define GS101_EINT2_INT_TYPE 0x3a68 +#define GS101_EINT2_INT_DIR 0x3a6c +#define GS101_EINT3_INT_IN 0x3a70 +#define GS101_EINT3_INT_EN 0x3a74 +#define GS101_EINT3_INT_TYPE 0x3a78 +#define GS101_EINT3_INT_DIR 0x3a7c +#define GS101_EINT_WAKEUP_MASK 0x3a80 +#define GS101_EINT_WAKEUP_MASK2 0x3a84 +#define GS101_EINT_WAKEUP_MASK3 0x3a88 +#define GS101_USER_DEFINED_INT_IN 0x3a90 +#define GS101_USER_DEFINED_INT_EN 0x3a94 +#define GS101_USER_DEFINED_INT_TYPE 0x3a98 +#define GS101_USER_DEFINED_INT_DIR 0x3a9c +#define GS101_SCAN2DRAM_INT_IN 0x3aa0 +#define GS101_SCAN2DRAM_INT_EN 0x3aa4 +#define GS101_SCAN2DRAM_INT_TYPE 0x3aa8 +#define GS101_SCAN2DRAM_INT_DIR 0x3aac +#define GS101_HCU_START 0x3ab0 +#define GS101_CUSTOM_OUT 0x3ac0 +#define GS101_CUSTOM_IN 0x3ac4 +#define GS101_CUSTOM_INT_IN 0x3ad0 +#define GS101_CUSTOM_INT_EN 0x3ad4 +#define GS101_CUSTOM_INT_TYPE 0x3ad8 +#define GS101_CUSTOM_INT_DIR 0x3adc +#define GS101_ACK_LAST_CPU 0x3afc +#define GS101_HCU_R(n) (0x3b00 + ((n) & 3) * 4) +#define GS101_HCU_SP 0x3b14 +#define GS101_HCU_PC 0x3b18 +#define GS101_PMU_RAM_CTRL 0x3b20 +#define GS101_APM_HCU_CTRL 0x3b24 +#define GS101_APM_NMI_ENABLE 0x3b30 +#define GS101_DBGCORE_NMI_ENABLE 0x3b34 +#define GS101_HCU_NMI_ENABLE 0x3b38 +#define GS101_PWR_HOLD_WDT_ENABLE 0x3b3c +#define GS101_NMI_SRC_IN 0x3b40 +#define GS101_RST_STAT 0x3b44 +#define GS101_RST_STAT_PMU 0x3b48 +#define GS101_HPM_INT_IN 0x3b60 +#define GS101_HPM_INT_EN 0x3b64 +#define GS101_HPM_INT_TYPE 0x3b68 +#define GS101_HPM_INT_DIR 0x3b6c +#define GS101_S2D_AUTH 0x3b70 +#define GS101_BOOT_STAT 0x3b74 +#define GS101_PMLINK_OUT 0x3c00 +#define GS101_PMLINK_AOC_OUT 0x3c04 +#define GS101_PMLINK_AOC_CTRL 0x3c08 +#define GS101_TCXO_BUF_CTRL 0x3c10 +#define GS101_ADD_CTRL 0x3c14 +#define GS101_HCU_TIMEOUT_RESET 0x3c20 +#define GS101_HCU_TIMEOUT_SCAN2DRAM 0x3c24 +#define GS101_TIMER(n) (0x3c80 + ((n) & 3) * 4) +#define GS101_PPC_MIF(n) (0x3c90 + ((n) & 3) * 4) +#define GS101_PPC_CORE 0x3ca0 +#define GS101_PPC_EH 0x3ca4 +#define GS101_PPC_CPUCL1_0 0x3ca8 +#define GS101_PPC_CPUCL1_1 0x3cac +#define GS101_EXT_REGULATOR_MIF_DURATION 0x3cb0 +#define GS101_EXT_REGULATOR_TOP_DURATION 0x3cb4 +#define GS101_EXT_REGULATOR_CPUCL2_DURATION 0x3cb8 +#define GS101_EXT_REGULATOR_CPUCL1_DURATION 0x3cbc +#define GS101_EXT_REGULATOR_G3D_DURATION 0x3cc0 +#define GS101_EXT_REGULATOR_TPU_DURATION 0x3cc4 +#define GS101_TCXO_DURATION 0x3cc8 +#define GS101_BURNIN_CTRL 0x3cd0 +#define GS101_JTAG_DBG_DET 0x3cd4 +#define GS101_MMC_CONWKUP_CTRL 0x3cd8 +#define GS101_USBDPPHY0_USBDP_WAKEUP 0x3cdc +#define GS101_TMU_TOP_TRIP 0x3ce0 +#define GS101_TMU_SUB_TRIP 0x3ce4 +#define GS101_MEMORY_CEN 0x3d00 +#define GS101_MEMORY_PGEN 0x3d04 +#define GS101_MEMORY_RET 0x3d08 +#define GS101_MEMORY_PGEN_FEEDBACK 0x3d0c +#define GS101_MEMORY_SMX 0x3d10 +#define GS101_MEMORY_SMX_FEEDBACK 0x3d14 +#define GS101_SLC_PCH_CHANNEL 0x3d20 +#define GS101_SLC_PCH_CB 0x3d24 +#define GS101_FORCE_NOMC 0x3d3c +#define GS101_FORCE_BOOST 0x3d4c +#define GS101_PMLINK_SLC_REQ 0x3d50 +#define GS101_PMLINK_SLC_ACK 0x3d54 +#define GS101_PMLINK_SLC_BUSY 0x3d58 +#define GS101_BOOTSYNC_OUT 0x3d80 +#define GS101_BOOTSYNC_IN 0x3d84 +#define GS101_SCAN_READY_OUT 0x3d88 +#define GS101_SCAN_READY_IN 0x3d8c +#define GS101_GSA_RESTORE 0x3d90 +#define GS101_ALIVE_OTP_LATCH 0x3d94 +#define GS101_DEBUG_OVERRIDE 0x3d98 +#define GS101_WDT_OPTION 0x3d9c +#define GS101_AOC_WDT_CFG 0x3da0 +#define GS101_CTRL_SECJTAG_ALIVE 0x3da4 +#define GS101_CTRL_DIV_PLL_ALV_DIVLOW 0x3e00 +#define GS101_CTRL_MUX_CLK_APM_REFSRC_AUTORESTORE 0x3e04 +#define GS101_CTRL_MUX_CLK_APM_REFSRC 0x3e08 +#define GS101_CTRL_MUX_CLK_APM_REF 0x3e0c +#define GS101_CTRL_MUX_PLL_ALV_DIV4 0x3e10 +#define GS101_CTRL_PLL_ALV_DIV4 0x3e14 +#define GS101_CTRL_OSCCLK_APMGSA 0x3e18 +#define GS101_CTRL_BLK_AOC_CLKS 0x3e1c +#define GS101_CTRL_PLL_ALV_LOCK 0x3e20 +#define GS101_CTRL_CLKDIV__CLKRTC 0x3e24 +#define GS101_CTRL_SOC32K 0x3e30 +#define GS101_CTRL_STM_PMU 0x3e34 +#define GS101_CTRL_PMU_DEBUG 0x3e38 +#define GS101_CTRL_DEBUG_UART 0x3e3c +#define GS101_CTRL_TCK 0x3e40 +#define GS101_CTRL_SBU_SW_EN 0x3e44 +#define GS101_PAD_CTRL_CLKOUT0 0x3e80 +#define GS101_PAD_CTRL_CLKOUT1 0x3e84 +#define GS101_PAD_CTRL_APM_24MOUT_0 0x3e88 +#define GS101_PAD_CTRL_APM_24MOUT_1 0x3e8c +#define GS101_PAD_CTRL_IO_FORCE_RETENTION 0x3e90 +#define GS101_PAD_CTRL_APACTIVE_n 0x3e94 +#define GS101_PAD_CTRL_TCXO_ON 0x3e98 +#define GS101_PAD_CTRL_PWR_HOLD 0x3e9c +#define GS101_PAD_CTRL_RESETO_n 0x3ea0 +#define GS101_PAD_CTRL_WRESETO_n 0x3ea4 +#define GS101_PHY_CTRL_USB20 0x3eb0 +#define GS101_PHY_CTRL_USBDP 0x3eb4 +#define GS101_PHY_CTRL_MIPI_DCPHY_M4M4 0x3eb8 +#define GS101_PHY_CTRL_MIPI_DCPHY_S4S4S4S4 0x3ebc +#define GS101_PHY_CTRL_PCIE_GEN4_0 0x3ec0 +#define GS101_PHY_CTRL_PCIE_GEN4_1 0x3ec4 +#define GS101_PHY_CTRL_UFS 0x3ec8 /* PMU INTR GEN */ #define GS101_GRP1_INTR_BID_UPEND (0x0108) -- cgit v1.2.3 From edd548dc64a699d71ea4f537f815044e763d01e1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 12:13:23 -0700 Subject: firmware: qcom: tzmem: fix qcom_tzmem_policy kernel-doc Fix kernel-doc warnings by using correct kernel-doc syntax and formatting to prevent warnings: Warning: include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_STATIC' not described in enum 'qcom_tzmem_policy' Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_MULTIPLIER' not described in enum 'qcom_tzmem_policy' Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_ON_DEMAND' not described in enum 'qcom_tzmem_policy' Fixes: 84f5a7b67b61 ("firmware: qcom: add a dedicated TrustZone buffer allocator") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20251017191323.1820167-1-rdunlap@infradead.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_tzmem.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h index 48ac0e5454c7..23173e0c3ddd 100644 --- a/include/linux/firmware/qcom/qcom_tzmem.h +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -17,11 +17,20 @@ struct qcom_tzmem_pool; * enum qcom_tzmem_policy - Policy for pool growth. */ enum qcom_tzmem_policy { - /**< Static pool, never grow above initial size. */ + /** + * @QCOM_TZMEM_POLICY_STATIC: Static pool, + * never grow above initial size. + */ QCOM_TZMEM_POLICY_STATIC = 1, - /**< When out of memory, add increment * current size of memory. */ + /** + * @QCOM_TZMEM_POLICY_MULTIPLIER: When out of memory, + * add increment * current size of memory. + */ QCOM_TZMEM_POLICY_MULTIPLIER, - /**< When out of memory add as much as is needed until max_size. */ + /** + * @QCOM_TZMEM_POLICY_ON_DEMAND: When out of memory + * add as much as is needed until max_size. + */ QCOM_TZMEM_POLICY_ON_DEMAND, }; -- cgit v1.2.3 From 84a222d1b369ba83f8947948670f775367e653f1 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 10 Oct 2025 12:46:32 +0000 Subject: firmware: exynos-acpm: add DVFS protocol Add ACPM DVFS protocol handler. It constructs DVFS messages that the APM firmware can understand. Signed-off-by: Tudor Ambarus Reviewed-by: Peter Griffin Tested-by: Peter Griffin # on gs101-oriole Link: https://patch.msgid.link/20251010-acpm-clk-v6-2-321ee8826fd4@linaro.org Signed-off-by: Krzysztof Kozlowski --- drivers/firmware/samsung/Makefile | 4 +- drivers/firmware/samsung/exynos-acpm-dvfs.c | 80 ++++++++++++++++++++++ drivers/firmware/samsung/exynos-acpm-dvfs.h | 21 ++++++ drivers/firmware/samsung/exynos-acpm.c | 5 ++ .../linux/firmware/samsung/exynos-acpm-protocol.h | 10 +++ 5 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/samsung/exynos-acpm-dvfs.c create mode 100644 drivers/firmware/samsung/exynos-acpm-dvfs.h (limited to 'include/linux') diff --git a/drivers/firmware/samsung/Makefile b/drivers/firmware/samsung/Makefile index 7b4c9f6f34f5..80d4f89b33a9 100644 --- a/drivers/firmware/samsung/Makefile +++ b/drivers/firmware/samsung/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -acpm-protocol-objs := exynos-acpm.o exynos-acpm-pmic.o +acpm-protocol-objs := exynos-acpm.o +acpm-protocol-objs += exynos-acpm-pmic.o +acpm-protocol-objs += exynos-acpm-dvfs.o obj-$(CONFIG_EXYNOS_ACPM_PROTOCOL) += acpm-protocol.o diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.c b/drivers/firmware/samsung/exynos-acpm-dvfs.c new file mode 100644 index 000000000000..1c5b2b143bcc --- /dev/null +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2020 Samsung Electronics Co., Ltd. + * Copyright 2020 Google LLC. + * Copyright 2025 Linaro Ltd. + */ + +#include +#include +#include +#include +#include + +#include "exynos-acpm.h" +#include "exynos-acpm-dvfs.h" + +#define ACPM_DVFS_ID GENMASK(11, 0) +#define ACPM_DVFS_REQ_TYPE GENMASK(15, 0) + +#define ACPM_DVFS_FREQ_REQ 0 +#define ACPM_DVFS_FREQ_GET 1 + +static void acpm_dvfs_set_xfer(struct acpm_xfer *xfer, u32 *cmd, size_t cmdlen, + unsigned int acpm_chan_id, bool response) +{ + xfer->acpm_chan_id = acpm_chan_id; + xfer->txd = cmd; + xfer->txlen = cmdlen; + + if (response) { + xfer->rxd = cmd; + xfer->rxlen = cmdlen; + } +} + +static void acpm_dvfs_init_set_rate_cmd(u32 cmd[4], unsigned int clk_id, + unsigned long rate) +{ + cmd[0] = FIELD_PREP(ACPM_DVFS_ID, clk_id); + cmd[1] = rate / HZ_PER_KHZ; + cmd[2] = FIELD_PREP(ACPM_DVFS_REQ_TYPE, ACPM_DVFS_FREQ_REQ); + cmd[3] = ktime_to_ms(ktime_get()); +} + +int acpm_dvfs_set_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id, + unsigned long rate) +{ + struct acpm_xfer xfer = {0}; + u32 cmd[4]; + + acpm_dvfs_init_set_rate_cmd(cmd, clk_id, rate); + acpm_dvfs_set_xfer(&xfer, cmd, sizeof(cmd), acpm_chan_id, false); + + return acpm_do_xfer(handle, &xfer); +} + +static void acpm_dvfs_init_get_rate_cmd(u32 cmd[4], unsigned int clk_id) +{ + cmd[0] = FIELD_PREP(ACPM_DVFS_ID, clk_id); + cmd[2] = FIELD_PREP(ACPM_DVFS_REQ_TYPE, ACPM_DVFS_FREQ_GET); + cmd[3] = ktime_to_ms(ktime_get()); +} + +unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id) +{ + struct acpm_xfer xfer; + unsigned int cmd[4] = {0}; + int ret; + + acpm_dvfs_init_get_rate_cmd(cmd, clk_id); + acpm_dvfs_set_xfer(&xfer, cmd, sizeof(cmd), acpm_chan_id, true); + + ret = acpm_do_xfer(handle, &xfer); + if (ret) + return 0; + + return xfer.rxd[1] * HZ_PER_KHZ; +} diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.h b/drivers/firmware/samsung/exynos-acpm-dvfs.h new file mode 100644 index 000000000000..9f2778e649c9 --- /dev/null +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2020 Samsung Electronics Co., Ltd. + * Copyright 2020 Google LLC. + * Copyright 2025 Linaro Ltd. + */ +#ifndef __EXYNOS_ACPM_DVFS_H__ +#define __EXYNOS_ACPM_DVFS_H__ + +#include + +struct acpm_handle; + +int acpm_dvfs_set_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int id, + unsigned long rate); +unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, + unsigned int acpm_chan_id, + unsigned int clk_id); + +#endif /* __EXYNOS_ACPM_DVFS_H__ */ diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c index 3a69fe3234c7..9fa0335ccf5d 100644 --- a/drivers/firmware/samsung/exynos-acpm.c +++ b/drivers/firmware/samsung/exynos-acpm.c @@ -29,6 +29,7 @@ #include #include "exynos-acpm.h" +#include "exynos-acpm-dvfs.h" #include "exynos-acpm-pmic.h" #define ACPM_PROTOCOL_SEQNUM GENMASK(21, 16) @@ -590,8 +591,12 @@ static int acpm_channels_init(struct acpm_info *acpm) */ static void acpm_setup_ops(struct acpm_info *acpm) { + struct acpm_dvfs_ops *dvfs_ops = &acpm->handle.ops.dvfs_ops; struct acpm_pmic_ops *pmic_ops = &acpm->handle.ops.pmic_ops; + dvfs_ops->set_rate = acpm_dvfs_set_rate; + dvfs_ops->get_rate = acpm_dvfs_get_rate; + pmic_ops->read_reg = acpm_pmic_read_reg; pmic_ops->bulk_read = acpm_pmic_bulk_read; pmic_ops->write_reg = acpm_pmic_write_reg; diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index f628bf1862c2..b1e95435240f 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -13,6 +13,15 @@ struct acpm_handle; struct device_node; +struct acpm_dvfs_ops { + int (*set_rate)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id, + unsigned long rate); + unsigned long (*get_rate)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, + unsigned int clk_id); +}; + struct acpm_pmic_ops { int (*read_reg)(const struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, @@ -32,6 +41,7 @@ struct acpm_pmic_ops { }; struct acpm_ops { + struct acpm_dvfs_ops dvfs_ops; struct acpm_pmic_ops pmic_ops; }; -- cgit v1.2.3 From 7f3779a3ac3e474d043f0a2b77dd6e6bb020c577 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Aug 2025 17:52:43 +0000 Subject: mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio() Add a mempolicy parameter to filemap_alloc_folio() to enable NUMA-aware page cache allocations. This will be used by upcoming changes to support NUMA policies in guest-memfd, where guest_memory need to be allocated NUMA policy specified by VMM. All existing users pass NULL maintaining current behavior. Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-4-shivankg@amd.com Signed-off-by: Sean Christopherson --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/verity.c | 2 +- fs/erofs/zdata.c | 2 +- fs/f2fs/compress.c | 2 +- include/linux/pagemap.h | 8 +++++--- mm/filemap.c | 14 +++++++++----- mm/readahead.c | 2 +- 7 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..d927ae32e7d0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -491,8 +491,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, - ~__GFP_FS), 0); + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), + 0, NULL); if (!folio) break; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..d4523d5debcd 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -742,7 +742,7 @@ again: } folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), - 0); + 0, NULL); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc80cfe482f7..b7369fb4fbe9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -562,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) * Allocate a managed folio for cached I/O, or it may be * then filled with a file-backed folio for in-place I/O */ - newfolio = filemap_alloc_folio(gfp, 0); + newfolio = filemap_alloc_folio(gfp, 0, NULL); if (!newfolio) continue; newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..a65e8cd388bc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1947,7 +1947,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, return; } - cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0); + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL); if (!cfolio) return; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..f1d0610210f7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,9 +654,11 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy); #else -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } @@ -667,7 +669,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o static inline struct page *__page_cache_alloc(gfp_t gfp) { - return &filemap_alloc_folio(gfp, 0)->page; + return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..7b42fd6dcc9a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1002,11 +1002,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { int n; struct folio *folio; + if (policy) + return folio_alloc_mpol_noprof(gfp, order, policy, + NO_INTERLEAVE_INDEX, numa_node_id()); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { @@ -2009,7 +2014,7 @@ no_page: err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order); + folio = filemap_alloc_folio(alloc_gfp, order, NULL); if (!folio) continue; @@ -2551,7 +2556,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) @@ -3983,8 +3988,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, - mapping_min_folio_order(mapping)); + folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); diff --git a/mm/readahead.c b/mm/readahead.c index 3a4b5d58eeb6..b415c9969176 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl, { struct folio *folio; - folio = filemap_alloc_folio(gfp_mask, order); + folio = filemap_alloc_folio(gfp_mask, order, NULL); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); -- cgit v1.2.3 From 16a542e22339cd5e73e56a956bbd335c7bd7c08c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Aug 2025 17:52:44 +0000 Subject: mm/filemap: Extend __filemap_get_folio() to support NUMA memory policies Extend __filemap_get_folio() to support NUMA memory policies by renaming the implementation to __filemap_get_folio_mpol() and adding a mempolicy parameter. The original function becomes a static inline wrapper that passes NULL for the mempolicy. This infrastructure will enable future support for NUMA-aware page cache allocations in guest_memfd memory backend KVM guests. Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-5-shivankg@amd.com Signed-off-by: Sean Christopherson --- include/linux/pagemap.h | 10 ++++++++-- mm/filemap.c | 11 ++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f1d0610210f7..a17fabbc0269 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -755,11 +755,17 @@ static inline fgf_t fgf_set_order(size_t size) } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp); +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +static inline struct folio *__filemap_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp) +{ + return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); +} + /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). diff --git a/mm/filemap.c b/mm/filemap.c index 7b42fd6dcc9a..91c4537283d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1928,11 +1928,12 @@ out: } /** - * __filemap_get_folio - Find and get a reference to a folio. + * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. + * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * @@ -1943,8 +1944,8 @@ out: * * Return: The found folio or an ERR_PTR() otherwise. */ -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp) +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; @@ -2014,7 +2015,7 @@ no_page: err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order, NULL); + folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; @@ -2061,7 +2062,7 @@ no_page: folio_clear_dropbehind(folio); return folio; } -EXPORT_SYMBOL(__filemap_get_folio); +EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) -- cgit v1.2.3 From c5ebcc80fcf7d2c6ed917371f024d2da5bce9128 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:07:27 -0700 Subject: iio: adc: qcom-vadc-common: fix vadc_scale_fn_type kernel-doc Fix multiple warnings in enum vadc_scale_fn_type by adding a leading '@' to the kernel-doc descriptions. Fixed 14 warnings in this one enum, such as: Warning: include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_DEFAULT' not described in enum 'vadc_scale_fn_type' Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_THERM_100K_PULLUP' not described in enum 'vadc_scale_fn_type' Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_PMIC_THERM' not described in enum 'vadc_scale_fn_type' Also prevent the warning on SCALE_HW_CALIB_INVALID by marking it "private:" so that kernel-doc notation is not needed for it. This leaves only one warning here, which I don't know the appropriate description of: qcom-vadc-common.h:125: warning: Enum value 'SCALE_HW_CALIB_PMIC_THERM_PM7' not described in enum 'vadc_scale_fn_type' Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/qcom-vadc-common.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h index aa21b032e861..3bf4c49726a7 100644 --- a/include/linux/iio/adc/qcom-vadc-common.h +++ b/include/linux/iio/adc/qcom-vadc-common.h @@ -83,27 +83,27 @@ struct vadc_linear_graph { /** * enum vadc_scale_fn_type - Scaling function to convert ADC code to * physical scaled units for the channel. - * SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV). - * SCALE_THERM_100K_PULLUP: Returns temperature in millidegC. + * @SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV). + * @SCALE_THERM_100K_PULLUP: Returns temperature in millidegC. * Uses a mapping table with 100K pullup. - * SCALE_PMIC_THERM: Returns result in milli degree's Centigrade. - * SCALE_XOTHERM: Returns XO thermistor voltage in millidegC. - * SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp - * SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to + * @SCALE_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_XOTHERM: Returns XO thermistor voltage in millidegC. + * @SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp + * @SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to * voltage (uV) with hardware applied offset/slope values to adc code. - * SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using + * @SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using * lookup table. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using + * @SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using * 100k pullup. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using + * @SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using * lookup table for PMIC7. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. * The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. * The hardware applies offset/slope to adc code. This is for PMIC7. - * SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5 + * @SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5 * charger temperature. - * SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5 + * @SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5 * SMB1390 temperature. */ enum vadc_scale_fn_type { @@ -120,6 +120,7 @@ enum vadc_scale_fn_type { SCALE_HW_CALIB_PMIC_THERM_PM7, SCALE_HW_CALIB_PM5_CHG_TEMP, SCALE_HW_CALIB_PM5_SMB_TEMP, + /* private: */ SCALE_HW_CALIB_INVALID, }; -- cgit v1.2.3 From 6837c006d4e72d6add451411bcf407e0dea4ad25 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 21 Oct 2025 15:24:47 +0000 Subject: firmware: exynos-acpm: add empty method to allow compile test Provide empty method for devm_acpm_get_by_node() if we aren't building in the CONFIG_EXYNOS_ACPM_PROTOCOL. This allows to test-build the CONFIG_EXYNOS_ACPM_CLK code. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202510211905.RgfWkgss-lkp@intel.com/ Fixes: 40498a742053 ("clk: samsung: add Exynos ACPM clock driver") Signed-off-by: Tudor Ambarus Link: https://patch.msgid.link/20251021-fix-acpm-clk-build-test-v1-1-236a3d6db7f5@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/firmware/samsung/exynos-acpm-protocol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index b1e95435240f..2091da965a5a 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -55,7 +55,16 @@ struct acpm_handle { struct device; +#if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL) const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, struct device_node *np); +#else + +static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) +{ + return NULL; +} +#endif #endif /* __EXYNOS_ACPM_PROTOCOL_H */ -- cgit v1.2.3 From fdd00d79dc0e8a3f90be65d5060c55bb115c0f43 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2025 20:35:43 -0700 Subject: ipack: fix ipack.h kernel-doc warnings Fix various kernel-doc warnings in ipack.h: - Remove an empty kernel-doc comment. - Add 2 missing struct short descriptions. - Fix a typo in a description. - Add a missing struct field description. - Add some missing Return descriptions. - Clarify one function short description. Warning: ../include/linux/ipack.h:73 Cannot find identifier on line: */ Warning: ../include/linux/ipack.h:74 Cannot find identifier on line: struct ipack_region { Warning: ../include/linux/ipack.h:75 Cannot find identifier on line: phys_addr_t start; Warning: ../include/linux/ipack.h:76 Cannot find identifier on line: size_t size; Warning: ../include/linux/ipack.h:77 Cannot find identifier on line: }; Warning: ../include/linux/ipack.h:78 Cannot find identifier on line: Warning: ../include/linux/ipack.h:79 Cannot find identifier on line: /** Warning: ipack.h:80 missing initial short description on line: * struct ipack_device Warning: ipack.h:163 missing initial short description on line: * struct ipack_bus_device Warning: ipack.h:130 struct member 'id_table' not described in 'ipack_driver' Warning: ipack.h:189 No description found for return value of 'ipack_bus_register' Warning: ipack.h:194 No description found for return value of 'ipack_bus_unregister' *** Warning: ipack.h:202 No description found for return value of 'ipack_driver_register' Warning: ipack.h:221 No description found for return value of 'ipack_device_init' Warning: ipack.h:236 No description found for return value of 'ipack_device_add' Warning: ipack.h:271 No description found for return value of 'ipack_get_carrier' Signed-off-by: Randy Dunlap Acked-by: Vaibhav Gupta Link: https://patch.msgid.link/20251016033543.1142049-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/ipack.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipack.h b/include/linux/ipack.h index 2c6936b8371f..455f6c2a1903 100644 --- a/include/linux/ipack.h +++ b/include/linux/ipack.h @@ -70,15 +70,13 @@ enum ipack_space { IPACK_SPACE_COUNT, }; -/** - */ struct ipack_region { phys_addr_t start; size_t size; }; /** - * struct ipack_device + * struct ipack_device - subsystem representation of an IPack device * * @slot: Slot where the device is plugged in the carrier board * @bus: ipack_bus_device where the device is plugged to. @@ -89,7 +87,7 @@ struct ipack_region { * * Warning: Direct access to mapped memory is possible but the endianness * is not the same with PCI carrier or VME carrier. The endianness is managed - * by the carrier board throught bus->ops. + * by the carrier board through bus->ops. */ struct ipack_device { unsigned int slot; @@ -124,6 +122,7 @@ struct ipack_driver_ops { * struct ipack_driver -- Specific data to each ipack device driver * * @driver: Device driver kernel representation + * @id_table: Device ID table for this driver * @ops: Callbacks provided by the IPack device driver */ struct ipack_driver { @@ -161,7 +160,7 @@ struct ipack_bus_ops { }; /** - * struct ipack_bus_device + * struct ipack_bus_device - IPack bus representation * * @dev: pointer to carrier device * @slots: number of slots available @@ -185,6 +184,8 @@ struct ipack_bus_device { * * The carrier board device should call this function to register itself as * available bus device in ipack. + * + * Return: %NULL on error or &struct ipack_bus_device on success */ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots, const struct ipack_bus_ops *ops, @@ -192,6 +193,8 @@ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots, /** * ipack_bus_unregister -- unregister an ipack bus + * + * Return: %0 */ int ipack_bus_unregister(struct ipack_bus_device *bus); @@ -200,6 +203,8 @@ int ipack_bus_unregister(struct ipack_bus_device *bus); * * Called by a ipack driver to register itself as a driver * that can manage ipack devices. + * + * Return: zero on success or error code on failure. */ int ipack_driver_register(struct ipack_driver *edrv, struct module *owner, const char *name); @@ -215,7 +220,7 @@ void ipack_driver_unregister(struct ipack_driver *edrv); * function. The rest of the fields will be allocated and populated * during initalization. * - * Return zero on success or error code on failure. + * Return: zero on success or error code on failure. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use ipack_put_device() to give up the @@ -230,7 +235,7 @@ int ipack_device_init(struct ipack_device *dev); * Add a new IPack device. The call is done by the carrier driver * after calling ipack_device_init(). * - * Return zero on success or error code on failure. + * Return: zero on success or error code on failure. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use ipack_put_device() to give up the @@ -266,9 +271,11 @@ void ipack_put_device(struct ipack_device *dev); .device = (dev) /** - * ipack_get_carrier - it increase the carrier ref. counter of + * ipack_get_carrier - try to increase the carrier ref. counter of * the carrier module * @dev: mezzanine device which wants to get the carrier + * + * Return: true on success. */ static inline int ipack_get_carrier(struct ipack_device *dev) { -- cgit v1.2.3 From 9fd2eb9e18a0a0b5a127937586388ed0181d9dac Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Sep 2025 15:42:40 +0200 Subject: cdx: make cdx_bus_type constant Now that the driver core can properly handle constant struct bus_type, move the cdx_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Nipun Gupta Cc: Nikhil Agarwal Acked-by: Nipun Gupta Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025091439-sustained-acorn-4af4@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/cdx/cdx.c | 4 ++-- include/linux/cdx/cdx_bus.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index 3d50f8cd9c0b..b39af2f1937f 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -170,7 +170,7 @@ static int cdx_unregister_device(struct device *dev, return 0; } -static void cdx_unregister_devices(struct bus_type *bus) +static void cdx_unregister_devices(const struct bus_type *bus) { /* Reset all the devices attached to cdx bus */ bus_for_each_dev(bus, NULL, NULL, cdx_unregister_device); @@ -651,7 +651,7 @@ static struct attribute *cdx_bus_attrs[] = { }; ATTRIBUTE_GROUPS(cdx_bus); -struct bus_type cdx_bus_type = { +const struct bus_type cdx_bus_type = { .name = "cdx", .match = cdx_bus_match, .probe = cdx_probe, diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 79bb80e56790..b1ba97f6c9ad 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -234,7 +234,7 @@ int __must_check __cdx_driver_register(struct cdx_driver *cdx_driver, */ void cdx_driver_unregister(struct cdx_driver *cdx_driver); -extern struct bus_type cdx_bus_type; +extern const struct bus_type cdx_bus_type; /** * cdx_dev_reset - Reset CDX device -- cgit v1.2.3 From 61e606305672342858a647af3629d9dfcc4e4265 Mon Sep 17 00:00:00 2001 From: Adrian Barnaś Date: Fri, 19 Sep 2025 06:53:27 +0000 Subject: drivers: eisa: make eisa_bus_type const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because driver core can properly handle constant struct bus_type, move the eisa_bus_type to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: Adrian Barnaś Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250919065327.672924-1-abarnas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/eisa/eisa-bus.c | 2 +- include/linux/eisa.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c index edceea083b98..bd76d599109c 100644 --- a/drivers/eisa/eisa-bus.c +++ b/drivers/eisa/eisa-bus.c @@ -135,7 +135,7 @@ static int eisa_bus_uevent(const struct device *dev, struct kobj_uevent_env *env return 0; } -struct bus_type eisa_bus_type = { +const struct bus_type eisa_bus_type = { .name = "eisa", .match = eisa_bus_match, .uevent = eisa_bus_uevent, diff --git a/include/linux/eisa.h b/include/linux/eisa.h index 21a2ecc1e538..cf55630b595b 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -68,7 +68,7 @@ struct eisa_driver { /* These external functions are only available when EISA support is enabled. */ #ifdef CONFIG_EISA -extern struct bus_type eisa_bus_type; +extern const struct bus_type eisa_bus_type; int eisa_driver_register (struct eisa_driver *edrv); void eisa_driver_unregister (struct eisa_driver *edrv); -- cgit v1.2.3 From 8ce6b508f24b4ef3a78c2c0d92e67b9e324c4f7a Mon Sep 17 00:00:00 2001 From: Adrian Barnaś Date: Fri, 19 Sep 2025 07:32:01 +0000 Subject: drivers: rapidio: make rio_bus_type const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because driver core can properly handle constant struct bus_type, move the rio_bus_type to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: Adrian Barnaś Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250919073201.751348-1-abarnas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/rapidio/rio-driver.c | 2 +- include/linux/rio.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 238250e69005..bcfe0b45b377 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -227,7 +227,7 @@ struct class rio_mport_class = { }; EXPORT_SYMBOL_GPL(rio_mport_class); -struct bus_type rio_bus_type = { +const struct bus_type rio_bus_type = { .name = "rapidio", .match = rio_match_bus, .dev_groups = rio_dev_groups, diff --git a/include/linux/rio.h b/include/linux/rio.h index 3c29f40f3c94..2c29f21ba9e5 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -78,7 +78,7 @@ #define RIO_CTAG_RESRVD 0xfffe0000 /* Reserved */ #define RIO_CTAG_UDEVID 0x0001ffff /* Unique device identifier */ -extern struct bus_type rio_bus_type; +extern const struct bus_type rio_bus_type; extern struct class rio_mport_class; struct rio_mport; -- cgit v1.2.3 From cebd22dd3a0ac76e0e1f2f369bba710bc6b1dc66 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Thu, 18 Sep 2025 10:54:02 +0800 Subject: platform: Use IOMEM_ERR_PTR for ioremap error returns Replace ERR_PTR() with IOMEM_ERR_PTR() in stubbed ioremap functions to maintain type consistency. The functions return void __iomem * pointers and IOMEM_ERR_PTR() provides proper type casting to avoid sparse warnings. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509060307.JubgnLhc-lkp@intel.com/ Signed-off-by: Pei Xiao Link: https://patch.msgid.link/320f2cc9ada5cb66845daa6bf259000b4cffd8b3.1758163939.git.xiaopei01@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..1d424fed1435 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -80,7 +80,7 @@ static inline void __iomem * devm_platform_get_and_ioremap_resource(struct platform_device *pdev, unsigned int index, struct resource **res) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } @@ -88,14 +88,14 @@ static inline void __iomem * devm_platform_ioremap_resource(struct platform_device *pdev, unsigned int index) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } static inline void __iomem * devm_platform_ioremap_resource_byname(struct platform_device *pdev, const char *name) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } #endif -- cgit v1.2.3 From 6d0ef68955d30be1e218caf160ec32eec23ebc6e Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Tue, 23 Sep 2025 09:54:09 +0800 Subject: arch_topology: move parse_acpi_topology() to common code Currently, RISC-V lacks arch-specific registers for CPU topology properties and must get them from ACPI. Thus, parse_acpi_topology() is moved from arm64/ to drivers/ for RISC-V reuse. Signed-off-by: Yunhui Cui Reviewed-by: Sudeep Holla Link: https://patch.msgid.link/20250923015409.15983-2-cuiyunhui@bytedance.com Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/topology.h | 3 ++ arch/arm64/kernel/topology.c | 101 -------------------------------------- drivers/base/arch_topology.c | 96 +++++++++++++++++++++++++++++++++++- include/linux/arch_topology.h | 5 ++ 4 files changed, 103 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 341174bf9106..b9eaf4ad7085 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -36,6 +36,9 @@ void update_freq_counters_refs(void); #define arch_scale_hw_pressure topology_get_hw_pressure #define arch_update_hw_pressure topology_update_hw_pressure +#undef arch_cpu_is_threaded +#define arch_cpu_is_threaded() (read_cpuid_mpidr() & MPIDR_MT_BITMASK) + #include #endif /* _ASM_ARM_TOPOLOGY_H */ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 5d07ee85bdae..5d24dc53799b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -25,107 +25,6 @@ #include #include -#ifdef CONFIG_ACPI -static bool __init acpi_cpu_is_threaded(int cpu) -{ - int is_threaded = acpi_pptt_cpu_is_thread(cpu); - - /* - * if the PPTT doesn't have thread information, assume a homogeneous - * machine and return the current CPU's thread state. - */ - if (is_threaded < 0) - is_threaded = read_cpuid_mpidr() & MPIDR_MT_BITMASK; - - return !!is_threaded; -} - -struct cpu_smt_info { - unsigned int thread_num; - int core_id; -}; - -/* - * Propagate the topology information of the processor_topology_node tree to the - * cpu_topology array. - */ -int __init parse_acpi_topology(void) -{ - unsigned int max_smt_thread_num = 1; - struct cpu_smt_info *entry; - struct xarray hetero_cpu; - unsigned long hetero_id; - int cpu, topology_id; - - if (acpi_disabled) - return 0; - - xa_init(&hetero_cpu); - - for_each_possible_cpu(cpu) { - topology_id = find_acpi_cpu_topology(cpu, 0); - if (topology_id < 0) - return topology_id; - - if (acpi_cpu_is_threaded(cpu)) { - cpu_topology[cpu].thread_id = topology_id; - topology_id = find_acpi_cpu_topology(cpu, 1); - cpu_topology[cpu].core_id = topology_id; - - /* - * In the PPTT, CPUs below a node with the 'identical - * implementation' flag have the same number of threads. - * Count the number of threads for only one CPU (i.e. - * one core_id) among those with the same hetero_id. - * See the comment of find_acpi_cpu_topology_hetero_id() - * for more details. - * - * One entry is created for each node having: - * - the 'identical implementation' flag - * - its parent not having the flag - */ - hetero_id = find_acpi_cpu_topology_hetero_id(cpu); - entry = xa_load(&hetero_cpu, hetero_id); - if (!entry) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - WARN_ON_ONCE(!entry); - - if (entry) { - entry->core_id = topology_id; - entry->thread_num = 1; - xa_store(&hetero_cpu, hetero_id, - entry, GFP_KERNEL); - } - } else if (entry->core_id == topology_id) { - entry->thread_num++; - } - } else { - cpu_topology[cpu].thread_id = -1; - cpu_topology[cpu].core_id = topology_id; - } - topology_id = find_acpi_cpu_topology_cluster(cpu); - cpu_topology[cpu].cluster_id = topology_id; - topology_id = find_acpi_cpu_topology_package(cpu); - cpu_topology[cpu].package_id = topology_id; - } - - /* - * This is a short loop since the number of XArray elements is the - * number of heterogeneous CPU clusters. On a homogeneous system - * there's only one entry in the XArray. - */ - xa_for_each(&hetero_cpu, hetero_id, entry) { - max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); - xa_erase(&hetero_cpu, hetero_id); - kfree(entry); - } - - cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); - xa_destroy(&hetero_cpu); - return 0; -} -#endif - #ifdef CONFIG_ARM64_AMU_EXTN #define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) #define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1037169abb45..1ccb1eda4ce8 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -823,12 +823,106 @@ void remove_cpu_topology(unsigned int cpu) clear_cpu_topology(cpu); } +#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) +struct cpu_smt_info { + unsigned int thread_num; + int core_id; +}; + +static bool __init acpi_cpu_is_threaded(int cpu) +{ + int is_threaded = acpi_pptt_cpu_is_thread(cpu); + + /* + * if the PPTT doesn't have thread information, check for architecture + * specific fallback if available + */ + if (is_threaded < 0) + is_threaded = arch_cpu_is_threaded(); + + return !!is_threaded; +} + +/* + * Propagate the topology information of the processor_topology_node tree to the + * cpu_topology array. + */ __weak int __init parse_acpi_topology(void) { + unsigned int max_smt_thread_num = 1; + struct cpu_smt_info *entry; + struct xarray hetero_cpu; + unsigned long hetero_id; + int cpu, topology_id; + + if (acpi_disabled) + return 0; + + xa_init(&hetero_cpu); + + for_each_possible_cpu(cpu) { + topology_id = find_acpi_cpu_topology(cpu, 0); + if (topology_id < 0) + return topology_id; + + if (acpi_cpu_is_threaded(cpu)) { + cpu_topology[cpu].thread_id = topology_id; + topology_id = find_acpi_cpu_topology(cpu, 1); + cpu_topology[cpu].core_id = topology_id; + + /* + * In the PPTT, CPUs below a node with the 'identical + * implementation' flag have the same number of threads. + * Count the number of threads for only one CPU (i.e. + * one core_id) among those with the same hetero_id. + * See the comment of find_acpi_cpu_topology_hetero_id() + * for more details. + * + * One entry is created for each node having: + * - the 'identical implementation' flag + * - its parent not having the flag + */ + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); + entry = xa_load(&hetero_cpu, hetero_id); + if (!entry) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + WARN_ON_ONCE(!entry); + + if (entry) { + entry->core_id = topology_id; + entry->thread_num = 1; + xa_store(&hetero_cpu, hetero_id, + entry, GFP_KERNEL); + } + } else if (entry->core_id == topology_id) { + entry->thread_num++; + } + } else { + cpu_topology[cpu].thread_id = -1; + cpu_topology[cpu].core_id = topology_id; + } + topology_id = find_acpi_cpu_topology_cluster(cpu); + cpu_topology[cpu].cluster_id = topology_id; + topology_id = find_acpi_cpu_topology_package(cpu); + cpu_topology[cpu].package_id = topology_id; + } + + /* + * This is a short loop since the number of XArray elements is the + * number of heterogeneous CPU clusters. On a homogeneous system + * there's only one entry in the XArray. + */ + xa_for_each(&hetero_cpu, hetero_id, entry) { + max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); + xa_erase(&hetero_cpu, hetero_id); + kfree(entry); + } + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + xa_destroy(&hetero_cpu); return 0; } -#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) void __init init_cpu_topology(void) { int cpu, ret; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..766ed9cf0e54 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -80,6 +80,11 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) + +#ifndef arch_cpu_is_threaded +#define arch_cpu_is_threaded() (0) +#endif + void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); -- cgit v1.2.3 From f82890c98f3e3fd61983e9021354c632ecd47427 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 15 Oct 2025 04:30:13 +0000 Subject: tcpm: Parse and log AVS APDO The USB PD specification introduced new Adjustable Voltage Supply (AVS) types for both Standard Power Range (SPR) and Extended Power Range (EPR) sources. Add definitions to correctly parse and handle the new AVS APDO. Use bitfield macros to add inline helper functions to extract voltage, current, power, and peak current fields to parse and log the details of the new EPR AVS and SPR AVS APDO. Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Amit Sunil Dhamne Reviewed-by: Kyle Tso Reviewed-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20251015043017.3382908-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 15 +++++++++- include/linux/usb/pd.h | 69 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index b2a568a5bc9b..c65aa8104950 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -823,10 +823,23 @@ static void tcpm_log_source_caps(struct tcpm_port *port) case PDO_TYPE_APDO: if (pdo_apdo_type(pdo) == APDO_TYPE_PPS) scnprintf(msg, sizeof(msg), - "%u-%u mV, %u mA", + "PPS %u-%u mV, %u mA", pdo_pps_apdo_min_voltage(pdo), pdo_pps_apdo_max_voltage(pdo), pdo_pps_apdo_max_current(pdo)); + else if (pdo_apdo_type(pdo) == APDO_TYPE_EPR_AVS) + scnprintf(msg, sizeof(msg), + "EPR AVS %u-%u mV %u W peak_current: %u", + pdo_epr_avs_apdo_min_voltage_mv(pdo), + pdo_epr_avs_apdo_max_voltage_mv(pdo), + pdo_epr_avs_apdo_pdp_w(pdo), + pdo_epr_avs_apdo_src_peak_current(pdo)); + else if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS) + scnprintf(msg, sizeof(msg), + "SPR AVS 9-15 V: %u mA 15-20 V: %u mA peak_current: %u", + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(pdo), + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(pdo), + pdo_spr_avs_apdo_src_peak_current(pdo)); else strcpy(msg, "undefined APDO"); break; diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 3068c3084eb6..6ccd1b2af993 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -6,6 +6,7 @@ #ifndef __LINUX_USB_PD_H #define __LINUX_USB_PD_H +#include #include #include #include @@ -271,9 +272,11 @@ enum pd_pdo_type { enum pd_apdo_type { APDO_TYPE_PPS = 0, + APDO_TYPE_EPR_AVS = 1, + APDO_TYPE_SPR_AVS = 2, }; -#define PDO_APDO_TYPE_SHIFT 28 /* Only valid value currently is 0x0 - PPS */ +#define PDO_APDO_TYPE_SHIFT 28 #define PDO_APDO_TYPE_MASK 0x3 #define PDO_APDO_TYPE(t) ((t) << PDO_APDO_TYPE_SHIFT) @@ -297,6 +300,35 @@ enum pd_apdo_type { PDO_PPS_APDO_MIN_VOLT(min_mv) | PDO_PPS_APDO_MAX_VOLT(max_mv) | \ PDO_PPS_APDO_MAX_CURR(max_ma)) +/* + * Applicable only to EPR AVS APDO source cap as per + * Table 6.15 EPR Adjustable Voltage Supply APDO – Source + */ +#define PDO_EPR_AVS_APDO_PEAK_CURRENT GENMASK(27, 26) + +/* + * Applicable to both EPR AVS APDO source and sink cap as per + * Table 6.15 EPR Adjustable Voltage Supply APDO – Source + * Table 6.22 EPR Adjustable Voltage Supply APDO – Sink + */ +#define PDO_EPR_AVS_APDO_MAX_VOLT GENMASK(25, 17) /* 100mV unit */ +#define PDO_EPR_AVS_APDO_MIN_VOLT GENMASK(15, 8) /* 100mV unit */ +#define PDO_EPR_AVS_APDO_PDP GENMASK(7, 0) /* 1W unit */ + +/* + * Applicable only SPR AVS APDO source cap as per + * Table 6.14 SPR Adjustable Voltage Supply APDO – Source + */ +#define PDO_SPR_AVS_APDO_PEAK_CURRENT GENMASK(27, 26) + +/* + * Applicable to both SPR AVS APDO source and sink cap as per + * Table 6.14 SPR Adjustable Voltage Supply APDO – Source + * Table 6.21 SPR Adjustable Voltage Supply APDO – Sink + */ +#define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR GENMASK(19, 10) /* 10mA unit */ +#define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ + static inline enum pd_pdo_type pdo_type(u32 pdo) { return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; @@ -350,6 +382,41 @@ static inline unsigned int pdo_pps_apdo_max_current(u32 pdo) PDO_PPS_APDO_CURR_MASK) * 50; } +static inline unsigned int pdo_epr_avs_apdo_src_peak_current(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_PEAK_CURRENT, pdo); +} + +static inline unsigned int pdo_epr_avs_apdo_min_voltage_mv(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_MIN_VOLT, pdo) * 100; +} + +static inline unsigned int pdo_epr_avs_apdo_max_voltage_mv(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_MIN_VOLT, pdo) * 100; +} + +static inline unsigned int pdo_epr_avs_apdo_pdp_w(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_PDP, pdo); +} + +static inline unsigned int pdo_spr_avs_apdo_src_peak_current(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_PEAK_CURRENT, pdo); +} + +static inline unsigned int pdo_spr_avs_apdo_9v_to_15v_max_current_ma(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR, pdo) * 10; +} + +static inline unsigned int pdo_spr_avs_apdo_15v_to_20v_max_current_ma(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR, pdo) * 10; +} + /* RDO: Request Data Object */ #define RDO_OBJ_POS_SHIFT 28 #define RDO_OBJ_POS_MASK 0x7 -- cgit v1.2.3 From 832c8d3fce77cf03cc225fc555c1bffa1c547ba1 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 14 Oct 2025 18:06:47 +0200 Subject: usb: typec: ps883x: Add USB4 mode and TBT3 altmode support This chip can do some more than the driver currently describes. Add support for configuring it for various flavors of TBT3/USB4 operation. Reviewed-by: Jack Pham Signed-off-by: Konrad Dybcio Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20251014-topic-ps883x_usb4-v1-3-e6adb1a4296e@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/ps883x.c | 29 +++++++++++++++++++++++++++++ include/linux/usb/typec_tbt.h | 1 + 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/typec/mux/ps883x.c b/drivers/usb/typec/mux/ps883x.c index 72f1e737ca4b..7c61629b36d6 100644 --- a/drivers/usb/typec/mux/ps883x.c +++ b/drivers/usb/typec/mux/ps883x.c @@ -14,15 +14,18 @@ #include #include #include +#include #include #include #include #include +#include #define REG_USB_PORT_CONN_STATUS_0 0x00 #define CONN_STATUS_0_CONNECTION_PRESENT BIT(0) #define CONN_STATUS_0_ORIENTATION_REVERSED BIT(1) +#define CONN_STATUS_0_ACTIVE_CABLE BIT(2) #define CONN_STATUS_0_USB_3_1_CONNECTED BIT(5) #define REG_USB_PORT_CONN_STATUS_1 0x01 @@ -34,6 +37,10 @@ #define REG_USB_PORT_CONN_STATUS_2 0x02 +#define CONN_STATUS_2_TBT_CONNECTED BIT(0) +#define CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT BIT(4) +#define CONN_STATUS_2_USB4_CONNECTED BIT(7) + struct ps883x_retimer { struct i2c_client *client; struct gpio_desc *reset_gpio; @@ -95,6 +102,8 @@ static int ps883x_configure(struct ps883x_retimer *retimer, int cfg0, static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state *state) { + struct typec_thunderbolt_data *tb_data; + const struct enter_usb_data *eudo_data; int cfg0 = CONN_STATUS_0_CONNECTION_PRESENT; int cfg1 = 0x00; int cfg2 = 0x00; @@ -120,6 +129,18 @@ static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state break; } break; + case USB_TYPEC_TBT_SID: + tb_data = state->data; + + /* Unconditional */ + cfg2 |= CONN_STATUS_2_TBT_CONNECTED; + + if (tb_data->cable_mode & TBT_CABLE_ACTIVE_PASSIVE) + cfg0 |= CONN_STATUS_0_ACTIVE_CABLE; + + if (tb_data->enter_vdo & TBT_ENTER_MODE_UNI_DIR_LSRX) + cfg2 |= CONN_STATUS_2_TBT_UNIDIR_LSRX_ACT_LT; + break; default: dev_err(&retimer->client->dev, "Got unsupported SID: 0x%x\n", state->alt->svid); @@ -135,6 +156,14 @@ static int ps883x_set(struct ps883x_retimer *retimer, struct typec_retimer_state case TYPEC_MODE_USB3: cfg0 |= CONN_STATUS_0_USB_3_1_CONNECTED; break; + case TYPEC_MODE_USB4: + eudo_data = state->data; + + cfg2 |= CONN_STATUS_2_USB4_CONNECTED; + + if (FIELD_GET(EUDO_CABLE_TYPE_MASK, eudo_data->eudo) != EUDO_CABLE_TYPE_PASSIVE) + cfg0 |= CONN_STATUS_0_ACTIVE_CABLE; + break; default: dev_err(&retimer->client->dev, "Got unsupported mode: %lu\n", state->mode); diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index 55dcea12082c..0b570f1b8bc8 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -55,6 +55,7 @@ struct typec_thunderbolt_data { /* TBT3 Device Enter Mode VDO bits */ #define TBT_ENTER_MODE_CABLE_SPEED(s) TBT_SET_CABLE_SPEED(s) +#define TBT_ENTER_MODE_UNI_DIR_LSRX BIT(23) #define TBT_ENTER_MODE_ACTIVE_CABLE BIT(24) #endif /* __USB_TYPEC_TBT_H */ -- cgit v1.2.3 From c88b6ee3ba3c7bf6386ea0e6de8111acc3d832bc Mon Sep 17 00:00:00 2001 From: Jingyi Wang Date: Wed, 24 Sep 2025 16:24:55 -0700 Subject: soc: qcom: llcc-qcom: Add support for Kaanapali Add system cache table and configs for Kaanapali SoC. Signed-off-by: Jingyi Wang Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250924-knp-llcc-v1-2-ae6a016e5138@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/llcc-qcom.c | 373 +++++++++++++++++++++++++++++++++++++ include/linux/soc/qcom/llcc-qcom.h | 7 + 2 files changed, 380 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index 857ead56b37d..13e174267294 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -214,6 +214,364 @@ static const struct llcc_slice_config ipq5424_data[] = { }, }; +static const struct llcc_slice_config kaanapali_data[] = { + { + .usecase_id = LLCC_CPUSS, + .slice_id = 1, + .max_cap = 5120, + .priority = 1, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .write_scid_en = true, + .stale_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_VIDSC0, + .slice_id = 2, + .max_cap = 512, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_AUDIO, + .slice_id = 35, + .max_cap = 512, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDMHPGRW, + .slice_id = 25, + .max_cap = 1024, + .priority = 5, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CMPT, + .slice_id = 34, + .max_cap = 4096, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_GPUHTW, + .slice_id = 11, + .max_cap = 512, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_GPU, + .slice_id = 9, + .max_cap = 5632, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .write_scid_cacheable_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MMUHWT, + .slice_id = 18, + .max_cap = 768, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_DISP, + .slice_id = 16, + .max_cap = 7168, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .cache_mode = 2, + .stale_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDMHPFX, + .slice_id = 24, + .max_cap = 1024, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDMPNG, + .slice_id = 27, + .max_cap = 256, + .priority = 5, + .bonus_ways = 0xfffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CVP, + .slice_id = 8, + .max_cap = 800, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_MODPE, + .slice_id = 29, + .max_cap = 256, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xf0000000, + .mru_uncap_en = true, + .alloc_oneway_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_WRCACHE, + .slice_id = 31, + .max_cap = 512, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CVPFW, + .slice_id = 19, + .max_cap = 512, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CPUMTE, + .slice_id = 7, + .max_cap = 256, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CMPTHCP, + .slice_id = 15, + .max_cap = 256, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_LCPDARE, + .slice_id = 30, + .max_cap = 128, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .mru_uncap_en = true, + .alloc_oneway_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_AENPU, + .slice_id = 3, + .max_cap = 3072, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .cache_mode = 2, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_ISLAND1, + .slice_id = 12, + .max_cap = 7936, + .priority = 7, + .fixed_size = true, + .bonus_ways = 0x7fffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_DISP_WB, + .slice_id = 23, + .max_cap = 512, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_VIDVSP, + .slice_id = 4, + .max_cap = 256, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_VIDDEC, + .slice_id = 5, + .max_cap = 512, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .cache_mode = 2, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMOFE, + .slice_id = 33, + .max_cap = 6144, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMRTIP, + .slice_id = 13, + .max_cap = 6144, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMRTRF, + .slice_id = 10, + .max_cap = 3584, + .priority = 3, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAMSRTRF, + .slice_id = 21, + .max_cap = 6144, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .stale_en = true, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_VIDEO_APV, + .slice_id = 6, + .max_cap = 768, + .priority = 4, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_COMPUTE1, + .slice_id = 22, + .max_cap = 4096, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CPUSS_OPP, + .slice_id = 32, + .max_cap = 0, + .priority = 0, + .fixed_size = true, + .bonus_ways = 0, + .activate_on_init = true, + .write_scid_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CPUSSMPAM, + .slice_id = 17, + .max_cap = 2048, + .priority = 1, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .activate_on_init = true, + .write_scid_en = true, + .stale_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_CAM_IPE_STROV, + .slice_id = 14, + .max_cap = 400, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CAM_OFE_STROV, + .slice_id = 20, + .max_cap = 400, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xffffffff, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + .parent_slice_id = 33, + }, { + .usecase_id = LLCC_CPUSS_HEU, + .slice_id = 28, + .max_cap = 0, + .priority = 0, + .fixed_size = true, + .bonus_ways = 0, + .mru_uncap_en = true, + .ovcap_en = true, + .vict_prio = true, + }, { + .usecase_id = LLCC_MDM_PNG_FIXED, + .slice_id = 26, + .max_cap = 256, + .priority = 5, + .fixed_size = true, + .bonus_ways = 0xff000000, + .activate_on_init = true, + .write_scid_en = true, + .mru_uncap_en = true, + .vict_prio = true, + }, +}; + static const struct llcc_slice_config sa8775p_data[] = { { .usecase_id = LLCC_CPUSS, @@ -3505,6 +3863,15 @@ static const u32 llcc_v6_reg_offset[] = { [LLCC_TRP_WRS_CACHEABLE_EN] = 0x00042088, }; +static const struct qcom_llcc_config kaanapali_cfg[] = { + { + .sct_data = kaanapali_data, + .size = ARRAY_SIZE(kaanapali_data), + .reg_offset = llcc_v6_reg_offset, + .edac_reg_offset = &llcc_v6_edac_reg_offset, + }, +}; + static const struct qcom_llcc_config qcs615_cfg[] = { { .sct_data = qcs615_data, @@ -3731,6 +4098,11 @@ static const struct qcom_llcc_config x1e80100_cfg[] = { }, }; +static const struct qcom_sct_config kaanapali_cfgs = { + .llcc_config = kaanapali_cfg, + .num_config = ARRAY_SIZE(kaanapali_cfg), +}; + static const struct qcom_sct_config qcs615_cfgs = { .llcc_config = qcs615_cfg, .num_config = ARRAY_SIZE(qcs615_cfg), @@ -4570,6 +4942,7 @@ err: static const struct of_device_id qcom_llcc_of_match[] = { { .compatible = "qcom,ipq5424-llcc", .data = &ipq5424_cfgs}, + { .compatible = "qcom,kaanapali-llcc", .data = &kaanapali_cfgs}, { .compatible = "qcom,qcs615-llcc", .data = &qcs615_cfgs}, { .compatible = "qcom,qcs8300-llcc", .data = &qcs8300_cfgs}, { .compatible = "qcom,qdu1000-llcc", .data = &qdu1000_cfgs}, diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 7a69210a250c..0287f9182c4d 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -74,7 +74,14 @@ #define LLCC_CAMSRTIP 73 #define LLCC_CAMRTRF 74 #define LLCC_CAMSRTRF 75 +#define LLCC_VIDEO_APV 83 +#define LLCC_COMPUTE1 87 +#define LLCC_CPUSS_OPP 88 #define LLCC_CPUSSMPAM 89 +#define LLCC_CAM_IPE_STROV 92 +#define LLCC_CAM_OFE_STROV 93 +#define LLCC_CPUSS_HEU 94 +#define LLCC_MDM_PNG_FIXED 100 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From 7958b4bb806c1af800ca23c8333a98231b3ab0b1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 22 Oct 2025 15:23:42 +0200 Subject: pinctrl: pinmux: Add missing .function_is_gpio kerneldoc This callback was undocumented, add the docs. Reviewed-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinmux.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 6db6c3e1ccc2..094bbe2fd6fd 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -35,6 +35,16 @@ struct pinctrl_gpio_range; * name can be used with the generic @pinctrl_ops to retrieve the * actual pins affected. The applicable groups will be returned in * @groups and the number of groups in @num_groups + * @function_is_gpio: determine if the indicated function selector passed + * corresponds to the GPIO function which is used by the accelerated GPIO + * functions @gpio_request_enable, @gpio_disable_free and + * @gpio_set_direction. When the pin control core can properly determine + * if a function is a GPIO function, it is easier to use the @strict mode + * on the pin controller. Since a single function is passed, this is + * only useful on pin controllers that use a specific function for GPIO, + * and that usually presupposes that a one-group-per-pin approach is + * used, so that a single function can be set on a single pin to turn + * it to GPIO mode. * @set_mux: enable a certain muxing function with a certain pin group. The * driver does not need to figure out whether enabling this function * conflicts some other use of the pins in that group, such collisions -- cgit v1.2.3 From 245f14f5fe283c782b16143280f283bee29dbb5f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 30 Sep 2025 12:30:55 +0800 Subject: interconnect: Optimize kbps_to_icc() macro The current expansion of kbps_to_icc() introduces unnecessary logic when compiled from a general expression. Rewriting it allows compilers to emit shorter and more efficient code across architectures. For example, with gcc -O2: arm64: old: tst x0, 7 add w1, w0, 7 cset w2, ne cmp w0, 0 csel w0, w1, w0, lt add w0, w2, w0, asr 3 new: add w1, w0, 14 adds w0, w0, 7 csel w0, w1, w0, mi asr w0, w0, 3 x86-64: old: xor eax, eax test dil, 7 lea edx, [rdi+7] setne al test edi, edi cmovns edx, edi sar edx, 3 add eax, edx new: lea eax, [rdi+14] add edi, 7 cmovns eax, edi sar eax, 3 In both cases the old form relies on extra test and compare instructions (tst, test, cmp) combined with conditional moves or sets, while the new form uses fewer instructions by folding the addition and flag update together (adds on arm64, add on x86). This reduces the instruction sequence, prevents multiple evaluations of x when it is an expression or a function call, and keeps the macro simpler. Signed-off-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20250930043055.2200322-1-visitorckw@gmail.com Signed-off-by: Georgi Djakov --- include/linux/interconnect.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index e4b8808823ad..4b12821528a6 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -16,7 +16,7 @@ #define MBps_to_icc(x) ((x) * 1000) #define GBps_to_icc(x) ((x) * 1000 * 1000) #define bps_to_icc(x) (1) -#define kbps_to_icc(x) ((x) / 8 + ((x) % 8 ? 1 : 0)) +#define kbps_to_icc(x) (((x) + 7) / 8) #define Mbps_to_icc(x) ((x) * 1000 / 8) #define Gbps_to_icc(x) ((x) * 1000 * 1000 / 8) -- cgit v1.2.3 From e30f8e61e2518a837837daa26cda3c8cc30f3226 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Oct 2025 20:43:40 -0400 Subject: tracing: Add a tracepoint verification check at build time If a tracepoint is defined via DECLARE_TRACE() or TRACE_EVENT() but never called (via the trace_() function), its metadata is still around in memory and not discarded. When created via TRACE_EVENT() the situation is worse because the TRACE_EVENT() creates metadata that can be around 5k per trace event. Having unused trace events causes several thousand of wasted bytes. Add a verifier that injects a string of the name of the tracepoint it calls that is added to the discarded section "__tracepoint_check". For every builtin tracepoint, its name (which is saved in the in-memory section "__tracepoint_strings") will have its name also in the "__tracepoint_check" section if it is used. Add a new program that is run on build called tracepoint-update. This is executed on the vmlinux.o before the __tracepoint_check section is discarded (the section is discarded before vmlinux is created). This program will create an array of each string in the __tracepoint_check section and then sort it. Then it will walk the strings in the __tracepoint_strings section and do a binary search to check if its name is in the __tracepoint_check section. If it is not, then it is unused and a warning is printed. Note, this currently only handles tracepoints that are builtin and not in modules. Enabling this currently with a given config produces: warning: tracepoint 'sched_move_numa' is unused. warning: tracepoint 'sched_stick_numa' is unused. warning: tracepoint 'sched_swap_numa' is unused. warning: tracepoint 'pelt_hw_tp' is unused. warning: tracepoint 'pelt_irq_tp' is unused. warning: tracepoint 'rcu_preempt_task' is unused. warning: tracepoint 'rcu_unlock_preempted_task' is unused. warning: tracepoint 'xdp_bulk_tx' is unused. warning: tracepoint 'xdp_redirect_map' is unused. warning: tracepoint 'xdp_redirect_map_err' is unused. warning: tracepoint 'vma_mas_szero' is unused. warning: tracepoint 'vma_store' is unused. warning: tracepoint 'hugepage_set_pmd' is unused. warning: tracepoint 'hugepage_set_pud' is unused. warning: tracepoint 'hugepage_update_pmd' is unused. warning: tracepoint 'hugepage_update_pud' is unused. warning: tracepoint 'block_rq_remap' is unused. warning: tracepoint 'xhci_dbc_handle_event' is unused. warning: tracepoint 'xhci_dbc_handle_transfer' is unused. warning: tracepoint 'xhci_dbc_gadget_ep_queue' is unused. warning: tracepoint 'xhci_dbc_alloc_request' is unused. warning: tracepoint 'xhci_dbc_free_request' is unused. warning: tracepoint 'xhci_dbc_queue_request' is unused. warning: tracepoint 'xhci_dbc_giveback_request' is unused. warning: tracepoint 'tcp_ao_wrong_maclen' is unused. warning: tracepoint 'tcp_ao_mismatch' is unused. warning: tracepoint 'tcp_ao_key_not_found' is unused. warning: tracepoint 'tcp_ao_rnext_request' is unused. warning: tracepoint 'tcp_ao_synack_no_key' is unused. warning: tracepoint 'tcp_ao_snd_sne_update' is unused. warning: tracepoint 'tcp_ao_rcv_sne_update' is unused. Some of the above is totally unused but others are not used due to their "trace_" functions being inside configs, in which case, the defined tracepoints should also be inside those same configs. Others are architecture specific but defined in generic code, where they should either be moved to the architecture or be surrounded by #ifdef for the architectures they are for. This tool could be updated to process modules in the future. I'd like to thank Mathieu Desnoyers for suggesting using strings instead of pointers, as using pointers in vmlinux.o required handling relocations and it required implementing almost a full feature linker to do so. To enable this check, run the build with: make UT=1 Note, when all the existing unused tracepoints are removed from the build, the "UT=1" will be removed and this will always be enabled when tracepoints are configured to warn on any new tracepoints. The reason this isn't always enabled now is because it will introduce a lot of warnings for the current unused tracepoints, and all bisects would end at this commit for those warnings. Link: https://lore.kernel.org/all/20250528114549.4d8a5e03@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Cc: Randy Dunlap Cc: Stephen Rothwell Link: https://lore.kernel.org/20251022004452.920728129@kernel.org Suggested-by: Mathieu Desnoyers # for using strings instead of pointers Signed-off-by: Steven Rostedt (Google) --- Makefile | 21 ++++ include/asm-generic/vmlinux.lds.h | 1 + include/linux/tracepoint.h | 11 ++ scripts/Makefile | 3 + scripts/link-vmlinux.sh | 7 ++ scripts/tracepoint-update.c | 232 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 275 insertions(+) create mode 100644 scripts/tracepoint-update.c (limited to 'include/linux') diff --git a/Makefile b/Makefile index d14824792227..9823c20c4278 100644 --- a/Makefile +++ b/Makefile @@ -810,6 +810,25 @@ ifdef CONFIG_FUNCTION_TRACER CC_FLAGS_FTRACE := -pg endif +ifdef CONFIG_TRACEPOINTS +# To check for unused tracepoints (tracepoints that are defined but never +# called), run with: +# +# make UT=1 +# +# Each unused tracepoints can take up to 5KB of memory in the running kernel. +# It is best to remove any that are not used. +# +# This command line option will be removed when all current unused +# tracepoints are removed. + +ifeq ("$(origin UT)", "command line") + WARN_ON_UNUSED_TRACEPOINTS := $(UT) +endif +endif # CONFIG_TRACEPOINTS + +export WARN_ON_UNUSED_TRACEPOINTS + include $(srctree)/arch/$(SRCARCH)/Makefile ifdef need-config @@ -1772,6 +1791,8 @@ help: @echo ' c: extra checks in the configuration stage (Kconfig)' @echo ' e: warnings are being treated as errors' @echo ' Multiple levels can be combined with W=12 or W=123' + @echo ' make UT=1 [targets] Warn if a tracepoint is defined but not used.' + @echo ' [ This will be removed when all current unused tracepoints are eliminated. ]' @$(if $(dtstree), \ echo ' make CHECK_DTBS=1 [targets] Check all generated dtb files against schema'; \ echo ' This can be applied both to "dtbs" and to individual "foo.dtb" targets' ; \ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a9a2e732a65..c510fb097a8c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1048,6 +1048,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.no_trim_symbol) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ + *(__tracepoint_check) \ #define DISCARDS \ /DISCARD/ : { \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 826ce3f8e1f8..1e53d3626c78 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -221,6 +221,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __do_trace_##name(args); \ } +/* + * When a tracepoint is used, it's name is added to the __tracepoint_check + * section. This section is only used at build time to make sure all + * defined tracepoints are used. It is discarded after the build. + */ +# define TRACEPOINT_CHECK(name) \ + static const char __used __section("__tracepoint_check") __trace_check[] = \ + #name; + /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -270,6 +279,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ if (cond) { \ guard(preempt_notrace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -289,6 +299,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ diff --git a/scripts/Makefile b/scripts/Makefile index f19624b3ed92..0941e5ce7b57 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -11,8 +11,10 @@ hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_builder hostprogs-always-$(CONFIG_RUST_KERNEL_DOCTESTS) += rustdoc_test_gen +hostprogs-always-$(CONFIG_TRACEPOINTS) += tracepoint-update sorttable-objs := sorttable.o elf-parse.o +tracepoint-update-objs := tracepoint-update.o elf-parse.o ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),) always-$(CONFIG_RUST) += target.json @@ -27,6 +29,7 @@ generate_rust_target-rust := y rustdoc_test_builder-rust := y rustdoc_test_gen-rust := y +HOSTCFLAGS_tracepoint-update.o = -I$(srctree)/tools/include HOSTCFLAGS_elf-parse.o = -I$(srctree)/tools/include HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include HOSTLDLIBS_sorttable = -lpthread diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 433849ff7529..d304029fa6da 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -208,6 +208,13 @@ kallsymso= strip_debug= generate_map= +# Use "make UT=1" to trigger warnings on unused tracepoints +case "${WARN_ON_UNUSED_TRACEPOINTS}" in +*1*) + ${objtree}/scripts/tracepoint-update vmlinux.o + ;; +esac + if is_enabled CONFIG_KALLSYMS; then true > .tmp_vmlinux0.syms kallsyms .tmp_vmlinux0.syms .tmp_vmlinux0.kallsyms diff --git a/scripts/tracepoint-update.c b/scripts/tracepoint-update.c new file mode 100644 index 000000000000..6ec30f39d0ad --- /dev/null +++ b/scripts/tracepoint-update.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "elf-parse.h" + +static Elf_Shdr *check_data_sec; +static Elf_Shdr *tracepoint_data_sec; + +static inline void *get_index(void *start, int entsize, int index) +{ + return start + (entsize * index); +} + +static int compare_strings(const void *a, const void *b) +{ + const char *av = *(const char **)a; + const char *bv = *(const char **)b; + + return strcmp(av, bv); +} + +struct elf_tracepoint { + Elf_Ehdr *ehdr; + const char **array; + int count; +}; + +#define REALLOC_SIZE (1 << 10) +#define REALLOC_MASK (REALLOC_SIZE - 1) + +static int add_string(const char *str, const char ***vals, int *count) +{ + const char **array = *vals; + + if (!(*count & REALLOC_MASK)) { + int size = (*count) + REALLOC_SIZE; + + array = realloc(array, sizeof(char *) * size); + if (!array) { + fprintf(stderr, "Failed memory allocation\n"); + return -1; + } + *vals = array; + } + + array[(*count)++] = str; + return 0; +} + +/** + * for_each_shdr_str - iterator that reads strings that are in an ELF section. + * @len: "int" to hold the length of the current string + * @ehdr: A pointer to the ehdr of the ELF file + * @sec: The section that has the strings to iterate on + * + * This is a for loop that iterates over all the nul terminated strings + * that are in a given ELF section. The variable "str" will hold + * the current string for each iteration and the passed in @len will + * contain the strlen() of that string. + */ +#define for_each_shdr_str(len, ehdr, sec) \ + for (const char *str = (void *)(ehdr) + shdr_offset(sec), \ + *end = str + shdr_size(sec); \ + len = strlen(str), str < end; \ + str += (len) + 1) + + +static void make_trace_array(struct elf_tracepoint *etrace) +{ + Elf_Ehdr *ehdr = etrace->ehdr; + const char **vals = NULL; + int count = 0; + int len; + + etrace->array = NULL; + + /* + * The __tracepoint_check section is filled with strings of the + * names of tracepoints (in tracepoint_strings). Create an array + * that points to each string and then sort the array. + */ + for_each_shdr_str(len, ehdr, check_data_sec) { + if (!len) + continue; + if (add_string(str, &vals, &count) < 0) + return; + } + + /* If CONFIG_TRACEPOINT_VERIFY_USED is not set, there's nothing to do */ + if (!count) + return; + + qsort(vals, count, sizeof(char *), compare_strings); + + etrace->array = vals; + etrace->count = count; +} + +static int find_event(const char *str, void *array, size_t size) +{ + return bsearch(&str, array, size, sizeof(char *), compare_strings) != NULL; +} + +static void check_tracepoints(struct elf_tracepoint *etrace) +{ + Elf_Ehdr *ehdr = etrace->ehdr; + int len; + + if (!etrace->array) + return; + + /* + * The __tracepoints_strings section holds all the names of the + * defined tracepoints. If any of them are not in the + * __tracepoint_check_section it means they are not used. + */ + for_each_shdr_str(len, ehdr, tracepoint_data_sec) { + if (!len) + continue; + if (!find_event(str, etrace->array, etrace->count)) { + fprintf(stderr, "warning: tracepoint '%s' is unused.\n", str); + } + } + + free(etrace->array); +} + +static void *tracepoint_check(struct elf_tracepoint *etrace) +{ + make_trace_array(etrace); + check_tracepoints(etrace); + + return NULL; +} + +static int process_tracepoints(void *addr, char const *const fname) +{ + struct elf_tracepoint etrace = {0}; + Elf_Ehdr *ehdr = addr; + Elf_Shdr *shdr_start; + Elf_Shdr *string_sec; + const char *secstrings; + unsigned int shnum; + unsigned int shstrndx; + int shentsize; + int idx; + int done = 2; + + shdr_start = (Elf_Shdr *)((char *)ehdr + ehdr_shoff(ehdr)); + shentsize = ehdr_shentsize(ehdr); + + shstrndx = ehdr_shstrndx(ehdr); + if (shstrndx == SHN_XINDEX) + shstrndx = shdr_link(shdr_start); + string_sec = get_index(shdr_start, shentsize, shstrndx); + secstrings = (const char *)ehdr + shdr_offset(string_sec); + + shnum = ehdr_shnum(ehdr); + if (shnum == SHN_UNDEF) + shnum = shdr_size(shdr_start); + + for (int i = 0; done && i < shnum; i++) { + Elf_Shdr *shdr = get_index(shdr_start, shentsize, i); + + idx = shdr_name(shdr); + + /* locate the __tracepoint_check in vmlinux */ + if (!strcmp(secstrings + idx, "__tracepoint_check")) { + check_data_sec = shdr; + done--; + } + + /* locate the __tracepoints_ptrs section in vmlinux */ + if (!strcmp(secstrings + idx, "__tracepoints_strings")) { + tracepoint_data_sec = shdr; + done--; + } + } + + if (!check_data_sec) { + fprintf(stderr, "no __tracepoint_check in file: %s\n", fname); + return -1; + } + + if (!tracepoint_data_sec) { + fprintf(stderr, "no __tracepoint_strings in file: %s\n", fname); + return -1; + } + + etrace.ehdr = ehdr; + tracepoint_check(&etrace); + return 0; +} + +int main(int argc, char *argv[]) +{ + int n_error = 0; + size_t size = 0; + void *addr = NULL; + + if (argc < 2) { + fprintf(stderr, "usage: tracepoint-update vmlinux...\n"); + return 0; + } + + /* Process each file in turn, allowing deep failure. */ + for (int i = 1; i < argc; i++) { + addr = elf_map(argv[i], &size, 1 << ET_REL); + if (!addr) { + ++n_error; + continue; + } + + if (process_tracepoints(addr, argv[i])) + ++n_error; + + elf_unmap(addr, size); + } + + return !!n_error; +} -- cgit v1.2.3 From faf938153cad98d97f60ac835ead1db74961507e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Oct 2025 20:43:41 -0400 Subject: tracepoint: Do not warn for unused event that is exported There are a few generic events that may only be used by modules. They are defined and then set with EXPORT_TRACEPOINT*(). Mark events that are exported as being used, even though they still waste memory in the kernel proper. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Cc: Randy Dunlap Cc: Stephen Rothwell Link: https://lore.kernel.org/20251022004453.089254920@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 1e53d3626c78..8a56f3278b1b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -227,8 +227,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * defined tracepoints are used. It is discarded after the build. */ # define TRACEPOINT_CHECK(name) \ - static const char __used __section("__tracepoint_check") __trace_check[] = \ - #name; + static const char __used __section("__tracepoint_check") \ + __trace_check_##name[] = #name; /* * Make sure the alignment of the structure in the __tracepoints section will @@ -382,10 +382,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ EXPORT_SYMBOL_GPL(__traceiter_##name); \ EXPORT_STATIC_CALL_GPL(tp_func_##name) #define EXPORT_TRACEPOINT_SYMBOL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL(__tracepoint_##name); \ EXPORT_SYMBOL(__traceiter_##name); \ EXPORT_STATIC_CALL(tp_func_##name) -- cgit v1.2.3 From f864e4b721e386be132cc973eadefe5d52cdfd94 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 15 Oct 2025 20:26:07 +0100 Subject: clk: renesas: rzv2h: Add support for DSI clocks Add support for PLLDSI and its post-dividers in the RZ/V2H CPG driver and export helper APIs for use by the DSI driver. Introduce per-PLL-DSI state in the CPG private structure and provide a set of helper functions that find valid PLL parameter combinations for a requested frequency. The new helpers are rzv2h_get_pll_pars(), rzv2h_get_pll_div_pars(), rzv2h_get_pll_divs_pars() and rzv2h_get_pll_dtable_pars() and they are exported in the "RZV2H_CPG" namespace for use by other consumers (notably the DSI driver). These helpers perform iterative searches over PLL parameters (M, K, P, S) and optional post-dividers and return the best match (or an exact match when possible). Move PLL/CLK related limits and parameter types into the shared include (include/linux/clk/renesas.h) by adding struct rzv2h_pll_limits, struct rzv2h_pll_pars and struct rzv2h_pll_div_pars plus the RZV2H_CPG_PLL_DSI_LIMITS() helper macro to define DSI PLL limits. This change centralises the PLLDSI algorithms so the CPG and DSI drivers compute PLL parameters consistently and allows the DSI driver to accurately request rates and program its PLL. Co-developed-by: Fabrizio Castro Signed-off-by: Fabrizio Castro Signed-off-by: Lad Prabhakar Acked-by: Tomi Valkeinen Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251015192611.241920-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/clk/renesas/rzv2h-cpg.c | 497 ++++++++++++++++++++++++++++++++++++++++ drivers/clk/renesas/rzv2h-cpg.h | 19 +- include/linux/clk/renesas.h | 145 ++++++++++++ 3 files changed, 659 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/renesas/rzv2h-cpg.c b/drivers/clk/renesas/rzv2h-cpg.c index 6abac15d3475..182437800329 100644 --- a/drivers/clk/renesas/rzv2h-cpg.c +++ b/drivers/clk/renesas/rzv2h-cpg.c @@ -14,9 +14,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include #include #include #include @@ -26,6 +31,7 @@ #include #include #include +#include #include @@ -47,7 +53,9 @@ #define CPG_PLL_STBY(x) ((x)) #define CPG_PLL_STBY_RESETB BIT(0) +#define CPG_PLL_STBY_SSC_EN BIT(2) #define CPG_PLL_STBY_RESETB_WEN BIT(16) +#define CPG_PLL_STBY_SSC_EN_WEN BIT(18) #define CPG_PLL_CLK1(x) ((x) + 0x004) #define CPG_PLL_CLK1_KDIV GENMASK(31, 16) #define CPG_PLL_CLK1_MDIV GENMASK(15, 6) @@ -65,6 +73,22 @@ #define CPG_CLKSTATUS0 (0x700) +/* On RZ/G3E SoC we have two DSI PLLs */ +#define MAX_CPG_DSI_PLL 2 + +/** + * struct rzv2h_pll_dsi_info - PLL DSI information, holds the limits and parameters + * + * @pll_dsi_limits: PLL DSI parameters limits + * @pll_dsi_parameters: Calculated PLL DSI parameters + * @req_pll_dsi_rate: Requested PLL DSI rate + */ +struct rzv2h_pll_dsi_info { + const struct rzv2h_pll_limits *pll_dsi_limits; + struct rzv2h_pll_div_pars pll_dsi_parameters; + unsigned long req_pll_dsi_rate; +}; + /** * struct rzv2h_cpg_priv - Clock Pulse Generator Private Data * @@ -80,6 +104,7 @@ * @ff_mod_status_ops: Fixed Factor Module Status Clock operations * @mstop_count: Array of mstop values * @rcdev: Reset controller entity + * @pll_dsi_info: Array of PLL DSI information, holds the limits and parameters */ struct rzv2h_cpg_priv { struct device *dev; @@ -98,6 +123,8 @@ struct rzv2h_cpg_priv { atomic_t *mstop_count; struct reset_controller_dev rcdev; + + struct rzv2h_pll_dsi_info pll_dsi_info[MAX_CPG_DSI_PLL]; }; #define rcdev_to_priv(x) container_of(x, struct rzv2h_cpg_priv, rcdev) @@ -168,6 +195,460 @@ struct rzv2h_ff_mod_status_clk { #define to_rzv2h_ff_mod_status_clk(_hw) \ container_of(_hw, struct rzv2h_ff_mod_status_clk, fix.hw) +/** + * struct rzv2h_plldsi_div_clk - PLL DSI DDIV clock + * + * @dtable: divider table + * @priv: CPG private data + * @hw: divider clk + * @ddiv: divider configuration + */ +struct rzv2h_plldsi_div_clk { + const struct clk_div_table *dtable; + struct rzv2h_cpg_priv *priv; + struct clk_hw hw; + struct ddiv ddiv; +}; + +#define to_plldsi_div_clk(_hw) \ + container_of(_hw, struct rzv2h_plldsi_div_clk, hw) + +#define RZ_V2H_OSC_CLK_IN_MEGA (24 * MEGA) +#define RZV2H_MAX_DIV_TABLES (16) + +/** + * rzv2h_get_pll_pars - Finds the best combination of PLL parameters + * for a given frequency. + * + * @limits: Pointer to the structure containing the limits for the PLL parameters + * @pars: Pointer to the structure where the best calculated PLL parameters values + * will be stored + * @freq_millihz: Target output frequency in millihertz + * + * This function calculates the best set of PLL parameters (M, K, P, S) to achieve + * the desired frequency. + * There is no direct formula to calculate the PLL parameters, as it's an open + * system of equations, therefore this function uses an iterative approach to + * determine the best solution. The best solution is one that minimizes the error + * (desired frequency - actual frequency). + * + * Return: true if a valid set of parameters values is found, false otherwise. + */ +bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, u64 freq_millihz) +{ + u64 fout_min_millihz = mul_u32_u32(limits->fout.min, MILLI); + u64 fout_max_millihz = mul_u32_u32(limits->fout.max, MILLI); + struct rzv2h_pll_pars p, best; + + if (freq_millihz > fout_max_millihz || + freq_millihz < fout_min_millihz) + return false; + + /* Initialize best error to maximum possible value */ + best.error_millihz = S64_MAX; + + for (p.p = limits->p.min; p.p <= limits->p.max; p.p++) { + u32 fref = RZ_V2H_OSC_CLK_IN_MEGA / p.p; + u16 divider; + + for (divider = 1 << limits->s.min, p.s = limits->s.min; + p.s <= limits->s.max; p.s++, divider <<= 1) { + for (p.m = limits->m.min; p.m <= limits->m.max; p.m++) { + u64 output_m, output_k_range; + s64 pll_k, output_k; + u64 fvco, output; + + /* + * The frequency generated by the PLL + divider + * is calculated as follows: + * + * With: + * Freq = Ffout = Ffvco / 2^(pll_s) + * Ffvco = (pll_m + (pll_k / 65536)) * Ffref + * Ffref = 24MHz / pll_p + * + * Freq can also be rewritten as: + * Freq = Ffvco / 2^(pll_s) + * = ((pll_m + (pll_k / 65536)) * Ffref) / 2^(pll_s) + * = (pll_m * Ffref) / 2^(pll_s) + ((pll_k / 65536) * Ffref) / 2^(pll_s) + * = output_m + output_k + * + * Every parameter has been determined at this + * point, but pll_k. + * + * Considering that: + * limits->k.min <= pll_k <= limits->k.max + * Then: + * -0.5 <= (pll_k / 65536) < 0.5 + * Therefore: + * -Ffref / (2 * 2^(pll_s)) <= output_k < Ffref / (2 * 2^(pll_s)) + */ + + /* Compute output M component (in mHz) */ + output_m = DIV_ROUND_CLOSEST_ULL(mul_u32_u32(p.m, fref) * MILLI, + divider); + /* Compute range for output K (in mHz) */ + output_k_range = DIV_ROUND_CLOSEST_ULL(mul_u32_u32(fref, MILLI), + 2 * divider); + /* + * No point in continuing if we can't achieve + * the desired frequency + */ + if (freq_millihz < (output_m - output_k_range) || + freq_millihz >= (output_m + output_k_range)) { + continue; + } + + /* + * Compute the K component + * + * Since: + * Freq = output_m + output_k + * Then: + * output_k = Freq - output_m + * = ((pll_k / 65536) * Ffref) / 2^(pll_s) + * Therefore: + * pll_k = (output_k * 65536 * 2^(pll_s)) / Ffref + */ + output_k = freq_millihz - output_m; + pll_k = div_s64(output_k * 65536ULL * divider, + fref); + pll_k = DIV_S64_ROUND_CLOSEST(pll_k, MILLI); + + /* Validate K value within allowed limits */ + if (pll_k < limits->k.min || + pll_k > limits->k.max) + continue; + + p.k = pll_k; + + /* Compute (Ffvco * 65536) */ + fvco = mul_u32_u32(p.m * 65536 + p.k, fref); + if (fvco < mul_u32_u32(limits->fvco.min, 65536) || + fvco > mul_u32_u32(limits->fvco.max, 65536)) + continue; + + /* PLL_M component of (output * 65536 * PLL_P) */ + output = mul_u32_u32(p.m * 65536, RZ_V2H_OSC_CLK_IN_MEGA); + /* PLL_K component of (output * 65536 * PLL_P) */ + output += p.k * RZ_V2H_OSC_CLK_IN_MEGA; + /* Make it in mHz */ + output *= MILLI; + output = DIV_U64_ROUND_CLOSEST(output, 65536 * p.p * divider); + + /* Check output frequency against limits */ + if (output < fout_min_millihz || + output > fout_max_millihz) + continue; + + p.error_millihz = freq_millihz - output; + p.freq_millihz = output; + + /* If an exact match is found, return immediately */ + if (p.error_millihz == 0) { + *pars = p; + return true; + } + + /* Update best match if error is smaller */ + if (abs(best.error_millihz) > abs(p.error_millihz)) + best = p; + } + } + } + + /* If no valid parameters were found, return false */ + if (best.error_millihz == S64_MAX) + return false; + + *pars = best; + return true; +} +EXPORT_SYMBOL_NS_GPL(rzv2h_get_pll_pars, "RZV2H_CPG"); + +/* + * rzv2h_get_pll_divs_pars - Finds the best combination of PLL parameters + * and divider value for a given frequency. + * + * @limits: Pointer to the structure containing the limits for the PLL parameters + * @pars: Pointer to the structure where the best calculated PLL parameters and + * divider values will be stored + * @table: Pointer to the array of valid divider values + * @table_size: Size of the divider values array + * @freq_millihz: Target output frequency in millihertz + * + * This function calculates the best set of PLL parameters (M, K, P, S) and divider + * value to achieve the desired frequency. See rzv2h_get_pll_pars() for more details + * on how the PLL parameters are calculated. + * + * freq_millihz is the desired frequency generated by the PLL followed by a + * a gear. + */ +bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, u64 freq_millihz) +{ + struct rzv2h_pll_div_pars p, best; + + best.div.error_millihz = S64_MAX; + p.div.error_millihz = S64_MAX; + for (unsigned int i = 0; i < table_size; i++) { + if (!rzv2h_get_pll_pars(limits, &p.pll, freq_millihz * table[i])) + continue; + + p.div.divider_value = table[i]; + p.div.freq_millihz = DIV_U64_ROUND_CLOSEST(p.pll.freq_millihz, table[i]); + p.div.error_millihz = freq_millihz - p.div.freq_millihz; + + if (p.div.error_millihz == 0) { + *pars = p; + return true; + } + + if (abs(best.div.error_millihz) > abs(p.div.error_millihz)) + best = p; + } + + if (best.div.error_millihz == S64_MAX) + return false; + + *pars = best; + return true; +} +EXPORT_SYMBOL_NS_GPL(rzv2h_get_pll_divs_pars, "RZV2H_CPG"); + +static unsigned long rzv2h_cpg_plldsi_div_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw); + struct rzv2h_cpg_priv *priv = dsi_div->priv; + struct ddiv ddiv = dsi_div->ddiv; + u32 div; + + div = readl(priv->base + ddiv.offset); + div >>= ddiv.shift; + div &= clk_div_mask(ddiv.width); + div = dsi_div->dtable[div].div; + + return DIV_ROUND_CLOSEST_ULL(parent_rate, div); +} + +static int rzv2h_cpg_plldsi_div_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw); + struct pll_clk *pll_clk = to_pll(clk_hw_get_parent(hw)); + struct rzv2h_cpg_priv *priv = dsi_div->priv; + u8 table[RZV2H_MAX_DIV_TABLES] = { 0 }; + struct rzv2h_pll_div_pars *dsi_params; + struct rzv2h_pll_dsi_info *dsi_info; + const struct clk_div_table *div; + unsigned int i = 0; + u64 rate_millihz; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + dsi_params = &dsi_info->pll_dsi_parameters; + + rate_millihz = mul_u32_u32(req->rate, MILLI); + if (rate_millihz == dsi_params->div.error_millihz + dsi_params->div.freq_millihz) + goto exit_determine_rate; + + for (div = dsi_div->dtable; div->div; div++) { + if (i >= RZV2H_MAX_DIV_TABLES) + return -EINVAL; + table[i++] = div->div; + } + + if (!rzv2h_get_pll_divs_pars(dsi_info->pll_dsi_limits, dsi_params, table, i, + rate_millihz)) { + dev_err(priv->dev, "failed to determine rate for req->rate: %lu\n", + req->rate); + return -EINVAL; + } + +exit_determine_rate: + req->rate = DIV_ROUND_CLOSEST_ULL(dsi_params->div.freq_millihz, MILLI); + req->best_parent_rate = req->rate * dsi_params->div.divider_value; + dsi_info->req_pll_dsi_rate = req->best_parent_rate; + + return 0; +} + +static int rzv2h_cpg_plldsi_div_set_rate(struct clk_hw *hw, + unsigned long rate, + unsigned long parent_rate) +{ + struct rzv2h_plldsi_div_clk *dsi_div = to_plldsi_div_clk(hw); + struct pll_clk *pll_clk = to_pll(clk_hw_get_parent(hw)); + struct rzv2h_cpg_priv *priv = dsi_div->priv; + struct rzv2h_pll_div_pars *dsi_params; + struct rzv2h_pll_dsi_info *dsi_info; + struct ddiv ddiv = dsi_div->ddiv; + const struct clk_div_table *clkt; + bool divider_found = false; + u32 val, shift; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + dsi_params = &dsi_info->pll_dsi_parameters; + + for (clkt = dsi_div->dtable; clkt->div; clkt++) { + if (clkt->div == dsi_params->div.divider_value) { + divider_found = true; + break; + } + } + + if (!divider_found) + return -EINVAL; + + shift = ddiv.shift; + val = readl(priv->base + ddiv.offset) | DDIV_DIVCTL_WEN(shift); + val &= ~(clk_div_mask(ddiv.width) << shift); + val |= clkt->val << shift; + writel(val, priv->base + ddiv.offset); + + return 0; +} + +static const struct clk_ops rzv2h_cpg_plldsi_div_ops = { + .recalc_rate = rzv2h_cpg_plldsi_div_recalc_rate, + .determine_rate = rzv2h_cpg_plldsi_div_determine_rate, + .set_rate = rzv2h_cpg_plldsi_div_set_rate, +}; + +static struct clk * __init +rzv2h_cpg_plldsi_div_clk_register(const struct cpg_core_clk *core, + struct rzv2h_cpg_priv *priv) +{ + struct rzv2h_plldsi_div_clk *clk_hw_data; + struct clk **clks = priv->clks; + struct clk_init_data init; + const struct clk *parent; + const char *parent_name; + struct clk_hw *clk_hw; + int ret; + + parent = clks[core->parent]; + if (IS_ERR(parent)) + return ERR_CAST(parent); + + clk_hw_data = devm_kzalloc(priv->dev, sizeof(*clk_hw_data), GFP_KERNEL); + if (!clk_hw_data) + return ERR_PTR(-ENOMEM); + + clk_hw_data->priv = priv; + clk_hw_data->ddiv = core->cfg.ddiv; + clk_hw_data->dtable = core->dtable; + + parent_name = __clk_get_name(parent); + init.name = core->name; + init.ops = &rzv2h_cpg_plldsi_div_ops; + init.flags = core->flag; + init.parent_names = &parent_name; + init.num_parents = 1; + + clk_hw = &clk_hw_data->hw; + clk_hw->init = &init; + + ret = devm_clk_hw_register(priv->dev, clk_hw); + if (ret) + return ERR_PTR(ret); + + return clk_hw->clk; +} + +static int rzv2h_cpg_plldsi_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct pll_clk *pll_clk = to_pll(hw); + struct rzv2h_cpg_priv *priv = pll_clk->priv; + struct rzv2h_pll_dsi_info *dsi_info; + u64 rate_millihz; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + /* check if the divider has already invoked the algorithm */ + if (req->rate == dsi_info->req_pll_dsi_rate) + return 0; + + /* If the req->rate doesn't match we do the calculation assuming there is no divider */ + rate_millihz = mul_u32_u32(req->rate, MILLI); + if (!rzv2h_get_pll_pars(dsi_info->pll_dsi_limits, + &dsi_info->pll_dsi_parameters.pll, rate_millihz)) { + dev_err(priv->dev, + "failed to determine rate for req->rate: %lu\n", + req->rate); + return -EINVAL; + } + + req->rate = DIV_ROUND_CLOSEST_ULL(dsi_info->pll_dsi_parameters.pll.freq_millihz, MILLI); + dsi_info->req_pll_dsi_rate = req->rate; + + return 0; +} + +static int rzv2h_cpg_pll_set_rate(struct pll_clk *pll_clk, + struct rzv2h_pll_pars *params, + bool ssc_disable) +{ + struct rzv2h_cpg_priv *priv = pll_clk->priv; + u16 offset = pll_clk->pll.offset; + u32 val; + int ret; + + /* Put PLL into standby mode */ + writel(CPG_PLL_STBY_RESETB_WEN, priv->base + CPG_PLL_STBY(offset)); + ret = readl_poll_timeout_atomic(priv->base + CPG_PLL_MON(offset), + val, !(val & CPG_PLL_MON_LOCK), + 100, 2000); + if (ret) { + dev_err(priv->dev, "Failed to put PLLDSI into standby mode"); + return ret; + } + + /* Output clock setting 1 */ + writel(FIELD_PREP(CPG_PLL_CLK1_KDIV, (u16)params->k) | + FIELD_PREP(CPG_PLL_CLK1_MDIV, params->m) | + FIELD_PREP(CPG_PLL_CLK1_PDIV, params->p), + priv->base + CPG_PLL_CLK1(offset)); + + /* Output clock setting 2 */ + val = readl(priv->base + CPG_PLL_CLK2(offset)); + writel((val & ~CPG_PLL_CLK2_SDIV) | FIELD_PREP(CPG_PLL_CLK2_SDIV, params->s), + priv->base + CPG_PLL_CLK2(offset)); + + /* Put PLL to normal mode */ + if (ssc_disable) + val = CPG_PLL_STBY_SSC_EN_WEN; + else + val = CPG_PLL_STBY_SSC_EN_WEN | CPG_PLL_STBY_SSC_EN; + writel(val | CPG_PLL_STBY_RESETB_WEN | CPG_PLL_STBY_RESETB, + priv->base + CPG_PLL_STBY(offset)); + + /* PLL normal mode transition, output clock stability check */ + ret = readl_poll_timeout_atomic(priv->base + CPG_PLL_MON(offset), + val, (val & CPG_PLL_MON_LOCK), + 100, 2000); + if (ret) { + dev_err(priv->dev, "Failed to put PLLDSI into normal mode"); + return ret; + } + + return 0; +} + +static int rzv2h_cpg_plldsi_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct pll_clk *pll_clk = to_pll(hw); + struct rzv2h_pll_dsi_info *dsi_info; + struct rzv2h_cpg_priv *priv = pll_clk->priv; + + dsi_info = &priv->pll_dsi_info[pll_clk->pll.instance]; + + return rzv2h_cpg_pll_set_rate(pll_clk, &dsi_info->pll_dsi_parameters.pll, true); +} + static int rzv2h_cpg_pll_clk_is_enabled(struct clk_hw *hw) { struct pll_clk *pll_clk = to_pll(hw); @@ -238,6 +719,12 @@ static unsigned long rzv2h_cpg_pll_clk_recalc_rate(struct clk_hw *hw, return DIV_ROUND_CLOSEST_ULL(rate, FIELD_GET(CPG_PLL_CLK1_PDIV, clk1)); } +static const struct clk_ops rzv2h_cpg_plldsi_ops = { + .recalc_rate = rzv2h_cpg_pll_clk_recalc_rate, + .determine_rate = rzv2h_cpg_plldsi_determine_rate, + .set_rate = rzv2h_cpg_plldsi_set_rate, +}; + static const struct clk_ops rzv2h_cpg_pll_ops = { .is_enabled = rzv2h_cpg_pll_clk_is_enabled, .enable = rzv2h_cpg_pll_clk_enable, @@ -264,6 +751,10 @@ rzv2h_cpg_pll_clk_register(const struct cpg_core_clk *core, if (!pll_clk) return ERR_PTR(-ENOMEM); + if (core->type == CLK_TYPE_PLLDSI) + priv->pll_dsi_info[core->cfg.pll.instance].pll_dsi_limits = + core->cfg.pll.limits; + parent_name = __clk_get_name(parent); init.name = core->name; init.ops = ops; @@ -588,6 +1079,12 @@ rzv2h_cpg_register_core_clk(const struct cpg_core_clk *core, case CLK_TYPE_SMUX: clk = rzv2h_cpg_mux_clk_register(core, priv); break; + case CLK_TYPE_PLLDSI: + clk = rzv2h_cpg_pll_clk_register(core, priv, &rzv2h_cpg_plldsi_ops); + break; + case CLK_TYPE_PLLDSI_DIV: + clk = rzv2h_cpg_plldsi_div_clk_register(core, priv); + break; default: goto fail; } diff --git a/drivers/clk/renesas/rzv2h-cpg.h b/drivers/clk/renesas/rzv2h-cpg.h index e2053049c299..637803bc1e89 100644 --- a/drivers/clk/renesas/rzv2h-cpg.h +++ b/drivers/clk/renesas/rzv2h-cpg.h @@ -22,15 +22,20 @@ struct pll { unsigned int offset:9; unsigned int has_clkn:1; unsigned int instance:2; + const struct rzv2h_pll_limits *limits; }; -#define PLL_PACK(_offset, _has_clkn, _instance) \ +#define PLL_PACK_LIMITS(_offset, _has_clkn, _instance, _limits) \ ((struct pll){ \ .offset = _offset, \ .has_clkn = _has_clkn, \ - .instance = _instance \ + .instance = _instance, \ + .limits = _limits \ }) +#define PLL_PACK(_offset, _has_clkn, _instance) \ + PLL_PACK_LIMITS(_offset, _has_clkn, _instance, NULL) + #define PLLCA55 PLL_PACK(0x60, 1, 0) #define PLLGPU PLL_PACK(0x120, 1, 0) @@ -191,6 +196,8 @@ enum clk_types { CLK_TYPE_PLL, CLK_TYPE_DDIV, /* Dynamic Switching Divider */ CLK_TYPE_SMUX, /* Static Mux */ + CLK_TYPE_PLLDSI, /* PLLDSI */ + CLK_TYPE_PLLDSI_DIV, /* PLLDSI divider */ }; #define DEF_TYPE(_name, _id, _type...) \ @@ -221,6 +228,14 @@ enum clk_types { .num_parents = ARRAY_SIZE(_parent_names), \ .flag = CLK_SET_RATE_PARENT, \ .mux_flags = CLK_MUX_HIWORD_MASK) +#define DEF_PLLDSI(_name, _id, _parent, _pll_packed) \ + DEF_TYPE(_name, _id, CLK_TYPE_PLLDSI, .parent = _parent, .cfg.pll = _pll_packed) +#define DEF_PLLDSI_DIV(_name, _id, _parent, _ddiv_packed, _dtable) \ + DEF_TYPE(_name, _id, CLK_TYPE_PLLDSI_DIV, \ + .cfg.ddiv = _ddiv_packed, \ + .dtable = _dtable, \ + .parent = _parent, \ + .flag = CLK_SET_RATE_PARENT) /** * struct rzv2h_mod_clk - Module Clocks definitions diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h index 0ebbe2f0b45e..69d8159deee3 100644 --- a/include/linux/clk/renesas.h +++ b/include/linux/clk/renesas.h @@ -10,7 +10,9 @@ #ifndef __LINUX_CLK_RENESAS_H_ #define __LINUX_CLK_RENESAS_H_ +#include #include +#include struct device; struct device_node; @@ -32,4 +34,147 @@ void cpg_mssr_detach_dev(struct generic_pm_domain *unused, struct device *dev); #define cpg_mssr_attach_dev NULL #define cpg_mssr_detach_dev NULL #endif + +/** + * struct rzv2h_pll_limits - PLL parameter constraints + * + * This structure defines the minimum and maximum allowed values for + * various parameters used to configure a PLL. These limits ensure + * the PLL operates within valid and stable ranges. + * + * @fout: Output frequency range (in MHz) + * @fout.min: Minimum allowed output frequency + * @fout.max: Maximum allowed output frequency + * + * @fvco: PLL oscillation frequency range (in MHz) + * @fvco.min: Minimum allowed VCO frequency + * @fvco.max: Maximum allowed VCO frequency + * + * @m: Main-divider range + * @m.min: Minimum main-divider value + * @m.max: Maximum main-divider value + * + * @p: Pre-divider range + * @p.min: Minimum pre-divider value + * @p.max: Maximum pre-divider value + * + * @s: Divider range + * @s.min: Minimum divider value + * @s.max: Maximum divider value + * + * @k: Delta-sigma modulator range (signed) + * @k.min: Minimum delta-sigma value + * @k.max: Maximum delta-sigma value + */ +struct rzv2h_pll_limits { + struct { + u32 min; + u32 max; + } fout; + + struct { + u32 min; + u32 max; + } fvco; + + struct { + u16 min; + u16 max; + } m; + + struct { + u8 min; + u8 max; + } p; + + struct { + u8 min; + u8 max; + } s; + + struct { + s16 min; + s16 max; + } k; +}; + +/** + * struct rzv2h_pll_pars - PLL configuration parameters + * + * This structure contains the configuration parameters for the + * Phase-Locked Loop (PLL), used to achieve a specific output frequency. + * + * @m: Main divider value + * @p: Pre-divider value + * @s: Output divider value + * @k: Delta-sigma modulation value + * @freq_millihz: Calculated PLL output frequency in millihertz + * @error_millihz: Frequency error from target in millihertz (signed) + */ +struct rzv2h_pll_pars { + u16 m; + u8 p; + u8 s; + s16 k; + u64 freq_millihz; + s64 error_millihz; +}; + +/** + * struct rzv2h_pll_div_pars - PLL parameters with post-divider + * + * This structure is used for PLLs that include an additional post-divider + * stage after the main PLL block. It contains both the PLL configuration + * parameters and the resulting frequency/error values after the divider. + * + * @pll: Main PLL configuration parameters (see struct rzv2h_pll_pars) + * + * @div: Post-divider configuration and result + * @div.divider_value: Divider applied to the PLL output + * @div.freq_millihz: Output frequency after divider in millihertz + * @div.error_millihz: Frequency error from target in millihertz (signed) + */ +struct rzv2h_pll_div_pars { + struct rzv2h_pll_pars pll; + struct { + u8 divider_value; + u64 freq_millihz; + s64 error_millihz; + } div; +}; + +#define RZV2H_CPG_PLL_DSI_LIMITS(name) \ + static const struct rzv2h_pll_limits (name) = { \ + .fout = { .min = 25 * MEGA, .max = 375 * MEGA }, \ + .fvco = { .min = 1600 * MEGA, .max = 3200 * MEGA }, \ + .m = { .min = 64, .max = 533 }, \ + .p = { .min = 1, .max = 4 }, \ + .s = { .min = 0, .max = 6 }, \ + .k = { .min = -32768, .max = 32767 }, \ + } \ + +#ifdef CONFIG_CLK_RZV2H +bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, u64 freq_millihz); + +bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, u64 freq_millihz); +#else +static inline bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, + u64 freq_millihz) +{ + return false; +} + +static inline bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, + u64 freq_millihz) +{ + return false; +} +#endif + #endif -- cgit v1.2.3 From fd714986e4e46effa6697b13d32918fc59608ccb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Oct 2025 19:21:09 -0700 Subject: iommu: Pass in old domain to attach_dev callback functions The IOMMU core attaches each device to a default domain on probe(). Then, every new "attach" operation has a fundamental meaning of two-fold: - detach from its currently attached (old) domain - attach to a given new domain Modern IOMMU drivers following this pattern usually want to clean up the things related to the old domain, so they call iommu_get_domain_for_dev() to fetch the old domain. Pass in the old domain pointer from the core to drivers, aligning with the set_dev_pasid op that does so already. Ensure all low-level attach fcuntions in the core can forward the correct old domain pointer. Thus, rework those functions as well. Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- arch/powerpc/kernel/iommu.c | 5 +++-- drivers/iommu/amd/iommu.c | 11 +++++----- drivers/iommu/apple-dart.c | 9 +++++--- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 5 +++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 24 +++++++++++++-------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 9 +++++--- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 11 +++++----- drivers/iommu/exynos-iommu.c | 8 ++++--- drivers/iommu/fsl_pamu_domain.c | 12 +++++------ drivers/iommu/intel/iommu.c | 10 ++++++--- drivers/iommu/intel/nested.c | 2 +- drivers/iommu/iommu.c | 25 +++++++++++++--------- drivers/iommu/iommufd/selftest.c | 2 +- drivers/iommu/ipmmu-vmsa.c | 10 ++++----- drivers/iommu/msm_iommu.c | 11 +++++----- drivers/iommu/mtk_iommu.c | 8 +++---- drivers/iommu/mtk_iommu_v1.c | 7 ++++-- drivers/iommu/omap-iommu.c | 12 +++++------ drivers/iommu/riscv/iommu.c | 9 +++++--- drivers/iommu/rockchip-iommu.c | 20 ++++++++++++----- drivers/iommu/s390-iommu.c | 13 ++++++----- drivers/iommu/sprd-iommu.c | 3 ++- drivers/iommu/sun50i-iommu.c | 8 ++++--- drivers/iommu/tegra-smmu.c | 10 ++++----- drivers/iommu/virtio-iommu.c | 6 ++++-- include/linux/iommu.h | 3 ++- 26 files changed, 153 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 244eb4857e7f..b7dcf07b2499 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1156,7 +1156,8 @@ EXPORT_SYMBOL_GPL(iommu_add_device); */ static int spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_table_group *table_group; @@ -1189,7 +1190,7 @@ static struct iommu_domain spapr_tce_platform_domain = { static int spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_group *grp = iommu_group_get(dev); struct iommu_table_group *table_group; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 6f4559eb5121..e16ad510c8c8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -70,8 +70,8 @@ int amd_iommu_max_glx_val = -1; */ DEFINE_IDA(pdom_ids); -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old); static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data); @@ -2635,7 +2635,8 @@ void amd_iommu_domain_free(struct iommu_domain *dom) } static int blocked_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); @@ -2685,8 +2686,8 @@ void amd_iommu_init_identity_domain(void) protection_domain_init(&identity_domain); } -static int amd_iommu_attach_device(struct iommu_domain *dom, - struct device *dev) +static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, + struct iommu_domain *old) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 95a4e62b8f63..b5848770ef48 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -672,7 +672,8 @@ static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, } static int apple_dart_attach_dev_paging(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret, i; struct apple_dart_stream_map *stream_map; @@ -693,7 +694,8 @@ static int apple_dart_attach_dev_paging(struct iommu_domain *domain, } static int apple_dart_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; @@ -717,7 +719,8 @@ static struct iommu_domain apple_dart_identity_domain = { }; static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct apple_dart_stream_map *stream_map; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 8cd8929bbfdf..313201a61699 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -138,14 +138,15 @@ void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) } static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_nested_domain *nested_domain = to_smmu_nested_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_ste ste; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2125ebfc9a70..a33fbd12a0dd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3002,7 +3002,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) master->ats_enabled = state->ats_enabled; } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old_domain) { int ret = 0; struct arm_smmu_ste target; @@ -3010,7 +3011,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_attach_state state = { - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; struct arm_smmu_master *master; @@ -3186,7 +3187,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, /* * When the last user of the CD table goes away downgrade the STE back - * to a non-cd_table one. + * to a non-cd_table one, by re-attaching its sid_domain. */ if (!arm_smmu_ssids_in_use(&master->cd_table)) { struct iommu_domain *sid_domain = @@ -3194,12 +3195,14 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain, if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || sid_domain->type == IOMMU_DOMAIN_BLOCKED) - sid_domain->ops->attach_dev(sid_domain, dev); + sid_domain->ops->attach_dev(sid_domain, dev, + sid_domain); } return 0; } static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct iommu_domain *old_domain, struct device *dev, struct arm_smmu_ste *ste, unsigned int s1dss) @@ -3207,7 +3210,7 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { .master = master, - .old_domain = iommu_get_domain_for_dev(dev), + .old_domain = old_domain, .ssid = IOMMU_NO_PASID, }; @@ -3248,14 +3251,16 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, + STRTAB_STE_1_S1DSS_BYPASS); return 0; } @@ -3269,14 +3274,15 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old_domain) { struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); - arm_smmu_attach_dev_ste(domain, dev, &ste, + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); return 0; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 4ced4b5bee4d..5e690cf85ec9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1165,7 +1165,8 @@ static void arm_smmu_master_install_s2crs(struct arm_smmu_master_cfg *cfg, } } -static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -1234,7 +1235,8 @@ static int arm_smmu_attach_dev_type(struct device *dev, } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); } @@ -1249,7 +1251,8 @@ static struct iommu_domain arm_smmu_identity_domain = { }; static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index c5be95e56031..9222a4a48bb3 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -359,7 +359,8 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) kfree(qcom_domain); } -static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int qcom_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); @@ -388,18 +389,18 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev } static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct qcom_iommu_domain *qcom_domain; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); unsigned int i; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - qcom_domain = to_qcom_iommu_domain(domain); + qcom_domain = to_qcom_iommu_domain(old); if (WARN_ON(!qcom_domain->iommu)) return -EINVAL; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 0857519ca718..e375ced6e2b0 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -984,7 +984,8 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) } static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); struct exynos_iommu_domain *domain; @@ -1035,7 +1036,8 @@ static struct iommu_domain exynos_identity_domain = { }; static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); @@ -1044,7 +1046,7 @@ static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, unsigned long flags; int err; - err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old); if (err) return err; diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 5f08523f97cb..9664ef9840d2 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -238,7 +238,7 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val) } static int fsl_pamu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); unsigned long flags; @@ -298,9 +298,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, * switches to what looks like BLOCKING. */ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct fsl_dma_domain *dma_domain; const u32 *prop; int len; @@ -311,11 +311,11 @@ static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, * Hack to keep things working as they always have, only leaving an * UNMANAGED domain makes it BLOCKING. */ - if (domain == platform_domain || !domain || - domain->type != IOMMU_DOMAIN_UNMANAGED) + if (old == platform_domain || !old || + old->type != IOMMU_DOMAIN_UNMANAGED) return 0; - dma_domain = to_fsl_dma_domain(domain); + dma_domain = to_fsl_dma_domain(old); /* * Use LIODN of the PCI controller while detaching a diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e236c7ec221f..f0396591cd9b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3230,7 +3230,8 @@ void device_block_translation(struct device *dev) } static int blocking_domain_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); @@ -3537,7 +3538,8 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) } static int intel_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret; @@ -4401,7 +4403,9 @@ static int device_setup_pass_through(struct device *dev) context_setup_pass_through_cb, dev); } -static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) +static int identity_domain_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 1b6ad9c900a5..760d7aa2ade8 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -19,7 +19,7 @@ #include "pasid.h" static int intel_nested_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ce141f095f96..2ca990dfbb88 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -100,7 +100,7 @@ static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); + struct device *dev, struct iommu_domain *old); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, @@ -114,6 +114,7 @@ enum { static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags); static int __iommu_group_set_domain_internal(struct iommu_group *group, struct iommu_domain *new_domain, @@ -554,7 +555,8 @@ static void iommu_deinit_device(struct device *dev) release_domain == ops->blocked_domain) release_domain = ops->identity_domain; - release_domain->ops->attach_dev(release_domain, dev); + release_domain->ops->attach_dev(release_domain, dev, + group->domain); } if (ops->release_device) @@ -640,7 +642,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list if (group->default_domain) iommu_create_device_direct_mappings(group->default_domain, dev); if (group->domain) { - ret = __iommu_device_set_domain(group, dev, group->domain, 0); + ret = __iommu_device_set_domain(group, dev, group->domain, NULL, + 0); if (ret) goto err_remove_gdev; } else if (!group->default_domain && !group_list) { @@ -2127,14 +2130,14 @@ static void __iommu_group_set_core_domain(struct iommu_group *group) } static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { int ret; if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; - ret = domain->ops->attach_dev(domain, dev); + ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; dev->iommu->attach_deferred = 0; @@ -2183,7 +2186,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { if (dev->iommu && dev->iommu->attach_deferred) - return __iommu_attach_device(domain, dev); + return __iommu_attach_device(domain, dev, NULL); return 0; } @@ -2296,6 +2299,7 @@ EXPORT_SYMBOL_GPL(iommu_attach_group); static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, + struct iommu_domain *old_domain, unsigned int flags) { int ret; @@ -2321,7 +2325,7 @@ static int __iommu_device_set_domain(struct iommu_group *group, dev->iommu->attach_deferred = 0; } - ret = __iommu_attach_device(new_domain, dev); + ret = __iommu_attach_device(new_domain, dev, old_domain); if (ret) { /* * If we have a blocking domain then try to attach that in hopes @@ -2331,7 +2335,8 @@ static int __iommu_device_set_domain(struct iommu_group *group, if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && group->blocking_domain && group->blocking_domain != new_domain) - __iommu_attach_device(group->blocking_domain, dev); + __iommu_attach_device(group->blocking_domain, dev, + old_domain); return ret; } return 0; @@ -2378,7 +2383,7 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, result = 0; for_each_group_device(group, gdev) { ret = __iommu_device_set_domain(group, gdev->dev, new_domain, - flags); + group->domain, flags); if (ret) { result = ret; /* @@ -2413,7 +2418,7 @@ err_revert: */ if (group->domain) WARN_ON(__iommu_device_set_domain( - group, gdev->dev, group->domain, + group, gdev->dev, group->domain, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED)); } return ret; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index de178827a078..5661d2da2b67 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -216,7 +216,7 @@ static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) } static int mock_domain_nop_attach(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mock_dev *mdev = to_mock_dev(dev); struct mock_viommu *new_viommu = NULL; diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ffa892f65714..6667ecc331f0 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -590,7 +590,7 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) } static int ipmmu_attach_device(struct iommu_domain *io_domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); @@ -637,17 +637,17 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, } static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct ipmmu_vmsa_domain *domain; unsigned int i; - if (io_domain == identity_domain || !io_domain) + if (old == identity_domain || !old) return 0; - domain = to_vmsa_domain(io_domain); + domain = to_vmsa_domain(old); for (i = 0; i < fwspec->num_ids; ++i) ipmmu_utlb_disable(domain, fwspec->ids[i]); diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 43a61ba021a5..819add75a665 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -391,7 +391,8 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev) return &iommu->iommu; } -static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; unsigned long flags; @@ -441,19 +442,19 @@ fail: } static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct msm_priv *priv; unsigned long flags; struct msm_iommu_dev *iommu; struct msm_iommu_ctx_dev *master; int ret = 0; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - priv = to_msm_priv(domain); + priv = to_msm_priv(old); free_io_pgtable_ops(priv->iop); spin_lock_irqsave(&msm_iommu_lock, flags); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 0e0285348d2b..9747ef164413 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -705,7 +705,7 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain) } static int mtk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata; struct mtk_iommu_domain *dom = to_mtk_domain(domain); @@ -773,12 +773,12 @@ err_unlock: } static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct mtk_iommu_data *data = dev_iommu_priv_get(dev); - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; mtk_iommu_config(data, dev, false, 0); diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 10cc0b1197e8..3b45650263ac 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -303,7 +303,9 @@ static void mtk_iommu_v1_domain_free(struct iommu_domain *domain) kfree(to_mtk_domain(domain)); } -static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev) +static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); @@ -329,7 +331,8 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device } static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 5c6f5943f44b..9f0057ccea57 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1431,8 +1431,8 @@ static void omap_iommu_detach_fini(struct omap_iommu_domain *odomain) odomain->iommus = NULL; } -static int -omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int omap_iommu_attach_dev(struct iommu_domain *domain, + struct device *dev, struct iommu_domain *old) { struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); struct omap_iommu_domain *omap_domain = to_omap_domain(domain); @@ -1536,15 +1536,15 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain, } static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct omap_iommu_domain *omap_domain; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - omap_domain = to_omap_domain(domain); + omap_domain = to_omap_domain(old); spin_lock(&omap_domain->lock); _omap_iommu_detach_dev(omap_domain, dev); spin_unlock(&omap_domain->lock); diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index ebb22979075d..d9429097a2b5 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -1321,7 +1321,8 @@ static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_m } static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); struct riscv_iommu_device *iommu = dev_to_iommu(dev); @@ -1426,7 +1427,8 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) } static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); @@ -1447,7 +1449,8 @@ static struct iommu_domain riscv_iommu_blocking_domain = { }; static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct riscv_iommu_device *iommu = dev_to_iommu(dev); struct riscv_iommu_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 0861dd469bd8..85f3667e797c 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -960,7 +960,8 @@ out_disable_clocks: } static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain; @@ -1005,7 +1006,7 @@ static struct iommu_domain rk_identity_domain = { }; static int rk_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct rk_iommu *iommu; struct rk_iommu_domain *rk_domain = to_rk_domain(domain); @@ -1026,7 +1027,7 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - ret = rk_iommu_identity_attach(&rk_identity_domain, dev); + ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old); if (ret) return ret; @@ -1041,8 +1042,17 @@ static int rk_iommu_attach_device(struct iommu_domain *domain, return 0; ret = rk_iommu_enable(iommu); - if (ret) - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); + if (ret) { + /* + * Note rk_iommu_identity_attach() might fail before physically + * attaching the dev to iommu->domain, in which case the actual + * old domain for this revert should be rk_identity_domain v.s. + * iommu->domain. Since rk_iommu_identity_attach() does not care + * about the old domain argument for now, this is not a problem. + */ + WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev, + iommu->domain)); + } pm_runtime_put(iommu->dev); diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index aa576736d60b..fe679850af28 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -670,7 +670,8 @@ int zpci_iommu_register_ioat(struct zpci_dev *zdev, u8 *status) } static int blocking_domain_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain *s390_domain; @@ -694,7 +695,8 @@ static int blocking_domain_attach_device(struct iommu_domain *domain, } static int s390_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); @@ -709,7 +711,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, domain->geometry.aperture_end < zdev->start_dma)) return -EINVAL; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); @@ -1131,13 +1133,14 @@ static int __init s390_iommu_init(void) subsys_initcall(s390_iommu_init); static int s390_attach_dev_identity(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct zpci_dev *zdev = to_zpci_dev(dev); u8 status; int cc; - blocking_domain_attach_device(&blocking_domain, dev); + blocking_domain_attach_device(&blocking_domain, dev, old); /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index c7ca1d8a0b15..555d4505c747 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -247,7 +247,8 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain) } static int sprd_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); struct sprd_iommu_domain *dom = to_sprd_domain(domain); diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index de10b569d9a9..d3b190be18b5 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -771,7 +771,8 @@ static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, } static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); struct sun50i_iommu_domain *sun50i_domain; @@ -797,7 +798,8 @@ static struct iommu_domain sun50i_iommu_identity_domain = { }; static int sun50i_iommu_attach_device(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); struct sun50i_iommu *iommu; @@ -813,7 +815,7 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, if (iommu->domain == domain) return 0; - sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old); sun50i_iommu_attach_domain(iommu, sun50i_domain); diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 36cdd5fbab07..336e0a3ff41f 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -490,7 +490,7 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu, } static int tegra_smmu_attach_dev(struct iommu_domain *domain, - struct device *dev) + struct device *dev, struct iommu_domain *old) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu *smmu = dev_iommu_priv_get(dev); @@ -524,9 +524,9 @@ disable: } static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct tegra_smmu_as *as; struct tegra_smmu *smmu; @@ -535,10 +535,10 @@ static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, if (!fwspec) return -ENODEV; - if (domain == identity_domain || !domain) + if (old == identity_domain || !old) return 0; - as = to_smmu_as(domain); + as = to_smmu_as(old); smmu = as->smmu; for (index = 0; index < fwspec->num_ids; index++) { tegra_smmu_disable(smmu, fwspec->ids[index], as->id); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b39d6f134ab2..d314fa5cd847 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -730,7 +730,8 @@ static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev) return domain; } -static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) +static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; @@ -781,7 +782,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int viommu_attach_identity_domain(struct iommu_domain *domain, - struct device *dev) + struct device *dev, + struct iommu_domain *old) { int ret = 0; struct virtio_iommu_req_attach req; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..801b2bd9e8d4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -751,7 +751,8 @@ struct iommu_ops { * @free: Release the domain after use. */ struct iommu_domain_ops { - int (*attach_dev)(struct iommu_domain *domain, struct device *dev); + int (*attach_dev)(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old); int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old); -- cgit v1.2.3 From 483768846d66c04354898f00bcdaad58a3763be2 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 15 Oct 2025 11:27:28 -0400 Subject: PCI: endpoint: Rename 'epf_bar::aligned_size' to 'epf_bar:mem_size' Rename the member 'epf_bar::aligned_size' to 'epf_bar::mem_size' to better reflect its purpose. 'aligned_size' was misleading, as it actually represents the backing memory size allocated for the BAR rather than the aligned size. Signed-off-by: Frank Li Signed-off-by: Manivannan Sadhasivam Reviewed-by: Niklas Cassel Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-1-9230298b1910@nxp.com --- drivers/pci/endpoint/pci-epf-core.c | 12 ++++++------ include/linux/pci-epf.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index d54e18872aef..214e3f6e6d0d 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -236,13 +236,13 @@ void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, } dev = epc->dev.parent; - dma_free_coherent(dev, epf_bar[bar].aligned_size, addr, + dma_free_coherent(dev, epf_bar[bar].mem_size, addr, epf_bar[bar].phys_addr); epf_bar[bar].phys_addr = 0; epf_bar[bar].addr = NULL; epf_bar[bar].size = 0; - epf_bar[bar].aligned_size = 0; + epf_bar[bar].mem_size = 0; epf_bar[bar].barno = 0; epf_bar[bar].flags = 0; } @@ -265,7 +265,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, enum pci_epc_interface_type type) { u64 bar_fixed_size = epc_features->bar[bar].fixed_size; - size_t aligned_size, align = epc_features->align; + size_t mem_size, align = epc_features->align; struct pci_epf_bar *epf_bar; dma_addr_t phys_addr; struct pci_epc *epc; @@ -297,7 +297,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, * it might be different if, for example, the fixed size of a BAR * is smaller than align. */ - aligned_size = align ? ALIGN(size, align) : size; + mem_size = align ? ALIGN(size, align) : size; if (type == PRIMARY_INTERFACE) { epc = epf->epc; @@ -308,7 +308,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, } dev = epc->dev.parent; - space = dma_alloc_coherent(dev, aligned_size, &phys_addr, GFP_KERNEL); + space = dma_alloc_coherent(dev, mem_size, &phys_addr, GFP_KERNEL); if (!space) { dev_err(dev, "failed to allocate mem space\n"); return NULL; @@ -317,7 +317,7 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, epf_bar[bar].phys_addr = phys_addr; epf_bar[bar].addr = space; epf_bar[bar].size = size; - epf_bar[bar].aligned_size = aligned_size; + epf_bar[bar].mem_size = mem_size; epf_bar[bar].barno = bar; if (upper_32_bits(size) || epc_features->bar[bar].only_64bit) epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 2e85504ba2ba..4022dd080e20 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -115,8 +115,8 @@ struct pci_epf_driver { * @phys_addr: physical address that should be mapped to the BAR * @addr: virtual address corresponding to the @phys_addr * @size: the size of the address space present in BAR - * @aligned_size: the size actually allocated to accommodate the iATU alignment - * requirement + * @mem_size: the size actually allocated to accommodate the iATU alignment + * requirement * @barno: BAR number * @flags: flags that are set for the BAR */ @@ -124,7 +124,7 @@ struct pci_epf_bar { dma_addr_t phys_addr; void *addr; size_t size; - size_t aligned_size; + size_t mem_size; enum pci_barno barno; int flags; }; -- cgit v1.2.3 From 0bfc6758f213a701bd662982de86f0032b51f18c Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 15 Oct 2025 11:27:30 -0400 Subject: PCI: endpoint: Add pci_epf_assign_bar_space() API Add pci_epf_assign_bar_space() API to allow setting any MMIO address as the BAR memory space, such as an MSI message base address. This API also conforms to the BAR base address and size alignment restrictions enforced by the PCI spec r6.0, sec 7.5.1.2.1. Signed-off-by: Frank Li [mani: removed unused epc var, reworded kdoc, comments and description] Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-3-9230298b1910@nxp.com --- drivers/pci/endpoint/pci-epf-core.c | 77 +++++++++++++++++++++++++++++++++++++ include/linux/pci-epf.h | 6 +++ 2 files changed, 83 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index ec20aabb7f75..9a505c796370 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -346,6 +346,83 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, } EXPORT_SYMBOL_GPL(pci_epf_alloc_space); +/** + * pci_epf_assign_bar_space() - Assign PCI EPF BAR space + * @epf: EPF device to assign the BAR memory + * @size: Size of the memory that has to be assigned + * @bar: BAR number for which the memory is assigned + * @epc_features: Features provided by the EPC specific to this EPF + * @type: Identifies if the assignment is for primary EPC or secondary EPC + * @bar_addr: Address to be assigned for the @bar + * + * Invoke to assign memory for the PCI EPF BAR. + * Flag PCI_BASE_ADDRESS_MEM_TYPE_64 will automatically get set if the BAR + * can only be a 64-bit BAR, or if the requested size is larger than 2 GB. + */ +int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type, + dma_addr_t bar_addr) +{ + size_t bar_size, aligned_mem_size; + struct pci_epf_bar *epf_bar; + dma_addr_t limit; + int pos; + + if (!size) + return -EINVAL; + + limit = bar_addr + size - 1; + + /* + * Bits: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + * bar_addr: U U U U U U 0 X X X X X X X X X + * limit: U U U U U U 1 X X X X X X X X X + * + * bar_addr^limit 0 0 0 0 0 0 1 X X X X X X X X X + * + * U: unchanged address bits in range [bar_addr, limit] + * X: bit 0 or 1 + * + * (bar_addr^limit) & BIT_ULL(pos) will find the first set bit from MSB + * (pos). And value of (2 ^ pos) should be able to cover the BAR range. + */ + for (pos = 8 * sizeof(dma_addr_t) - 1; pos > 0; pos--) + if ((limit ^ bar_addr) & BIT_ULL(pos)) + break; + + if (pos == 8 * sizeof(dma_addr_t) - 1) + return -EINVAL; + + bar_size = BIT_ULL(pos + 1); + if (pci_epf_get_required_bar_size(epf, &bar_size, &aligned_mem_size, + bar, epc_features, type)) + return -ENOMEM; + + if (type == PRIMARY_INTERFACE) + epf_bar = epf->bar; + else + epf_bar = epf->sec_epc_bar; + + epf_bar[bar].phys_addr = ALIGN_DOWN(bar_addr, aligned_mem_size); + + if (epf_bar[bar].phys_addr + bar_size < limit) + return -ENOMEM; + + epf_bar[bar].addr = NULL; + epf_bar[bar].size = bar_size; + epf_bar[bar].mem_size = aligned_mem_size; + epf_bar[bar].barno = bar; + if (upper_32_bits(size) || epc_features->bar[bar].only_64bit) + epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; + else + epf_bar[bar].flags |= PCI_BASE_ADDRESS_MEM_TYPE_32; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_epf_assign_bar_space); + static void pci_epf_remove_cfs(struct pci_epf_driver *driver) { struct config_group *group, *tmp; diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 4022dd080e20..48f68c4dcfa5 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -242,6 +242,12 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, enum pci_epc_interface_type type); +int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type, + dma_addr_t bar_addr); + int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar, u64 addr, dma_addr_t *base, size_t *off); int pci_epf_bind(struct pci_epf *epf); -- cgit v1.2.3 From 90a18c512884adb49ddc2fb30e94594169aae808 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Thu, 23 Oct 2025 15:26:50 +0200 Subject: pinctrl: pinconf-generic: Handle string values for generic properties Allow a generic pinconf property to specify its argument as one of the strings in a match list. Convert the matching string to an integer value using the index in the list, then keep using this value in the generic pinconf code. Signed-off-by: Antonio Borneo Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 57 ++++++++++++++++++++++++--------- include/linux/pinctrl/pinconf-generic.h | 11 +++++-- 2 files changed, 50 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index e3d10bbcdaeb..72906d71ae1a 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -65,11 +65,12 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, int i; for (i = 0; i < nitems; i++) { + const struct pin_config_item *item = &items[i]; unsigned long config; int ret; /* We want to check out this parameter */ - config = pinconf_to_config_packed(items[i].param, 0); + config = pinconf_to_config_packed(item->param, 0); if (gname) ret = pin_config_group_get(dev_name(pctldev->dev), gname, &config); @@ -86,15 +87,22 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, if (*print_sep) seq_puts(s, ", "); *print_sep = 1; - seq_puts(s, items[i].display); + seq_puts(s, item->display); /* Print unit if available */ - if (items[i].has_arg) { + if (item->has_arg) { u32 val = pinconf_to_config_argument(config); - if (items[i].format) - seq_printf(s, " (%u %s)", val, items[i].format); + if (item->format) + seq_printf(s, " (%u %s)", val, item->format); else seq_printf(s, " (0x%x)", val); + + if (item->values && item->num_values) { + if (val < item->num_values) + seq_printf(s, " \"%s\"", item->values[val]); + else + seq_puts(s, " \"(unknown)\""); + } } } } @@ -205,10 +213,10 @@ static const struct pinconf_generic_params dt_params[] = { * @ncfg. @ncfg is updated to reflect the number of entries after parsing. @cfg * needs to have enough memory allocated to hold all possible entries. */ -static void parse_dt_cfg(struct device_node *np, - const struct pinconf_generic_params *params, - unsigned int count, unsigned long *cfg, - unsigned int *ncfg) +static int parse_dt_cfg(struct device_node *np, + const struct pinconf_generic_params *params, + unsigned int count, unsigned long *cfg, + unsigned int *ncfg) { int i; @@ -217,7 +225,19 @@ static void parse_dt_cfg(struct device_node *np, int ret; const struct pinconf_generic_params *par = ¶ms[i]; - ret = of_property_read_u32(np, par->property, &val); + if (par->values && par->num_values) { + ret = fwnode_property_match_property_string(of_fwnode_handle(np), + par->property, + par->values, par->num_values); + if (ret == -ENOENT) + return ret; + if (ret >= 0) { + val = ret; + ret = 0; + } + } else { + ret = of_property_read_u32(np, par->property, &val); + } /* property not found */ if (ret == -EINVAL) @@ -231,6 +251,8 @@ static void parse_dt_cfg(struct device_node *np, cfg[*ncfg] = pinconf_to_config_packed(par->param, val); (*ncfg)++; } + + return 0; } /** @@ -323,13 +345,16 @@ int pinconf_generic_parse_dt_config(struct device_node *np, if (!cfg) return -ENOMEM; - parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); + ret = parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); + if (ret) + return ret; if (pctldev && pctldev->desc->num_custom_params && - pctldev->desc->custom_params) - parse_dt_cfg(np, pctldev->desc->custom_params, - pctldev->desc->num_custom_params, cfg, &ncfg); - - ret = 0; + pctldev->desc->custom_params) { + ret = parse_dt_cfg(np, pctldev->desc->custom_params, + pctldev->desc->num_custom_params, cfg, &ncfg); + if (ret) + return ret; + } /* no configs found at all */ if (ncfg == 0) { diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index d9245ecec71d..f82add5d3302 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -181,21 +181,28 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param return PIN_CONF_PACKED(param, argument); } -#define PCONFDUMP(a, b, c, d) { \ - .param = a, .display = b, .format = c, .has_arg = d \ +#define PCONFDUMP_WITH_VALUES(a, b, c, d, e, f) { \ + .param = a, .display = b, .format = c, .has_arg = d, \ + .values = e, .num_values = f \ } +#define PCONFDUMP(a, b, c, d) PCONFDUMP_WITH_VALUES(a, b, c, d, NULL, 0) + struct pin_config_item { const enum pin_config_param param; const char * const display; const char * const format; bool has_arg; + const char * const *values; + size_t num_values; }; struct pinconf_generic_params { const char * const property; enum pin_config_param param; u32 default_value; + const char * const *values; + size_t num_values; }; int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 55c7f5ef904fc2dcc7ef5945c5efb0cd60b46d32 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Thu, 23 Oct 2025 15:26:51 +0200 Subject: pinctrl: pinconf-generic: Add properties 'skew-delay-{in,out}put-ps' Add the properties 'skew-delay-input-ps' and 'skew-delay-output-ps' to the generic parameters used for parsing DT files. This allows to specify the independent skew delay value for the two directions. This enables drivers that use the generic pin configuration to get the value passed through these new properties. Signed-off-by: Antonio Borneo Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 4 ++++ include/linux/pinctrl/pinconf-generic.h | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 72906d71ae1a..366775841c63 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -54,6 +54,8 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), PCONFDUMP(PIN_CONFIG_SLEW_RATE, "slew rate", NULL, true), PCONFDUMP(PIN_CONFIG_SKEW_DELAY, "skew delay", NULL, true), + PCONFDUMP(PIN_CONFIG_SKEW_DELAY_INPUT_PS, "input skew delay", "ps", true), + PCONFDUMP(PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, "output skew delay", "ps", true), }; static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, @@ -198,6 +200,8 @@ static const struct pinconf_generic_params dt_params[] = { { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, { "skew-delay", PIN_CONFIG_SKEW_DELAY, 0 }, + { "skew-delay-input-ps", PIN_CONFIG_SKEW_DELAY_INPUT_PS, 0 }, + { "skew-delay-output-ps", PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, 0 }, }; /** diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index f82add5d3302..1be4032071c2 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -112,6 +112,12 @@ struct pinctrl_map; * or latch delay (on outputs) this parameter (in a custom format) * specifies the clock skew or latch delay. It typically controls how * many double inverters are put in front of the line. + * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the + * programmable skew rate (on inputs) and latch delay (on outputs), then + * this parameter specifies the clock skew only. The argument is in ps. + * @PIN_CONFIG_SKEW_DELAY_OUPUT_PS: if the pin has independent values for the + * programmable skew rate (on inputs) and latch delay (on outputs), then + * this parameter specifies the latch delay only. The argument is in ps. * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to * this parameter (on a custom format) tells the driver which alternative @@ -147,6 +153,8 @@ enum pin_config_param { PIN_CONFIG_PERSIST_STATE, PIN_CONFIG_POWER_SOURCE, PIN_CONFIG_SKEW_DELAY, + PIN_CONFIG_SKEW_DELAY_INPUT_PS, + PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, PIN_CONFIG_SLEEP_HARDWARE_STATE, PIN_CONFIG_SLEW_RATE, PIN_CONFIG_END = 0x7F, -- cgit v1.2.3 From bcce8c74f1ce1e2731ac0261287897e3768767d8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Oct 2025 15:46:21 -0700 Subject: PCI: Enable host bridge emulation for PCI_DOMAINS_GENERIC platforms The ability to emulate a host bridge is useful not only for hardware PCI controllers like CONFIG_VMD, or virtual PCI controllers like CONFIG_PCI_HYPERV, but also for test and development scenarios like CONFIG_SAMPLES_DEVSEC [1]. One stumbling block for defining CONFIG_SAMPLES_DEVSEC, a sample implementation of a platform TSM for PCI Device Security, is the need to accommodate PCI_DOMAINS_GENERIC architectures alongside x86 [2]. In support of supplementing the existing CONFIG_PCI_BRIDGE_EMUL infrastructure for host bridges: * Introduce pci_bus_find_emul_domain_nr() as a common way to find a free PCI domain number whether that is to reuse the existing dynamic allocation code in the !ACPI case, or to assign an unused domain above the last ACPI segment. * Convert pci-hyperv to the new allocator so that the PCI core can unconditionally assume that bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET is the dynamically allocated case. A follow on patch can also convert vmd to the new scheme. Currently vmd is limited to CONFIG_PCI_DOMAINS_GENERIC=n (x86) so, unlike pci-hyperv, it does not immediately conflict with this new pci_bus_find_emul_domain_nr() mechanism. Link: http://lore.kernel.org/174107249038.1288555.12362100502109498455.stgit@dwillia2-xfh.jf.intel.com [1] Reported-by: Suzuki K Poulose Closes: http://lore.kernel.org/20250311144601.145736-3-suzuki.poulose@arm.com [2] Signed-off-by: Dan Williams Signed-off-by: Bjorn Helgaas Tested-by: Suzuki K Poulose Tested-by: Michael Kelley Reviewed-by: Michael Kelley Cc: Lorenzo Pieralisi Cc: Manivannan Sadhasivam Cc: Rob Herring Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Link: https://patch.msgid.link/20251024224622.1470555-2-dan.j.williams@intel.com --- drivers/pci/controller/pci-hyperv.c | 62 ++++++------------------------------- drivers/pci/pci.c | 24 +++++++++++++- drivers/pci/probe.c | 8 ++++- include/linux/pci.h | 7 +++++ 4 files changed, 46 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 146b43981b27..1e237d3538f9 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3696,48 +3696,6 @@ static int hv_send_resources_released(struct hv_device *hdev) return 0; } -#define HVPCI_DOM_MAP_SIZE (64 * 1024) -static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); - -/* - * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 - * as invalid for passthrough PCI devices of this driver. - */ -#define HVPCI_DOM_INVALID 0 - -/** - * hv_get_dom_num() - Get a valid PCI domain number - * Check if the PCI domain number is in use, and return another number if - * it is in use. - * - * @dom: Requested domain number - * - * return: domain number on success, HVPCI_DOM_INVALID on failure - */ -static u16 hv_get_dom_num(u16 dom) -{ - unsigned int i; - - if (test_and_set_bit(dom, hvpci_dom_map) == 0) - return dom; - - for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { - if (test_and_set_bit(i, hvpci_dom_map) == 0) - return i; - } - - return HVPCI_DOM_INVALID; -} - -/** - * hv_put_dom_num() - Mark the PCI domain number as free - * @dom: Domain number to be freed - */ -static void hv_put_dom_num(u16 dom) -{ - clear_bit(dom, hvpci_dom_map); -} - /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -3750,9 +3708,9 @@ static int hv_pci_probe(struct hv_device *hdev, { struct pci_host_bridge *bridge; struct hv_pcibus_device *hbus; - u16 dom_req, dom; + int ret, dom; + u16 dom_req; char *name; - int ret; bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); if (!bridge) @@ -3779,11 +3737,14 @@ static int hv_pci_probe(struct hv_device *hdev, * PCI bus (which is actually emulated by the hypervisor) is domain 0. * (2) There will be no overlap between domains (after fixing possible * collisions) in the same VM. + * + * Because Gen1 VMs use domain 0, don't allow picking domain 0 here, + * even if bytes 4 and 5 of the instance GUID are both zero. For wider + * userspace compatibility, limit the domain ID to a 16-bit value. */ dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; - dom = hv_get_dom_num(dom_req); - - if (dom == HVPCI_DOM_INVALID) { + dom = pci_bus_find_emul_domain_nr(dom_req, 1, U16_MAX); + if (dom < 0) { dev_err(&hdev->device, "Unable to use dom# 0x%x or other numbers", dom_req); ret = -EINVAL; @@ -3917,7 +3878,7 @@ close: destroy_wq: destroy_workqueue(hbus->wq); free_dom: - hv_put_dom_num(hbus->bridge->domain_nr); + pci_bus_release_emul_domain_nr(hbus->bridge->domain_nr); free_bus: kfree(hbus); return ret; @@ -4042,8 +4003,6 @@ static void hv_pci_remove(struct hv_device *hdev) irq_domain_remove(hbus->irq_domain); irq_domain_free_fwnode(hbus->fwnode); - hv_put_dom_num(hbus->bridge->domain_nr); - kfree(hbus); } @@ -4217,9 +4176,6 @@ static int __init init_hv_pci_drv(void) if (ret) return ret; - /* Set the invalid domain number's bit, so it will not be used */ - set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); - /* Initialize PCI block r/w interface */ hvpci_block_ops.read_block = hv_read_config_block; hvpci_block_ops.write_block = hv_write_config_block; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b14dd064006c..f4ccb948e59d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6656,9 +6656,31 @@ static void pci_no_domains(void) #endif } +#ifdef CONFIG_PCI_DOMAINS +static DEFINE_IDA(pci_domain_nr_dynamic_ida); + +/** + * pci_bus_find_emul_domain_nr() - allocate a PCI domain number per constraints + * @hint: desired domain, 0 if any ID in the range of @min to @max is acceptable + * @min: minimum allowable domain + * @max: maximum allowable domain, no IDs higher than INT_MAX will be returned + */ +int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max) +{ + return ida_alloc_range(&pci_domain_nr_dynamic_ida, max(hint, min), max, + GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(pci_bus_find_emul_domain_nr); + +void pci_bus_release_emul_domain_nr(int domain_nr) +{ + ida_free(&pci_domain_nr_dynamic_ida, domain_nr); +} +EXPORT_SYMBOL_GPL(pci_bus_release_emul_domain_nr); +#endif + #ifdef CONFIG_PCI_DOMAINS_GENERIC static DEFINE_IDA(pci_domain_nr_static_ida); -static DEFINE_IDA(pci_domain_nr_dynamic_ida); static void of_pci_reserve_static_domain_nr(void) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c83e75a0ec12..c3f6e2714440 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -657,6 +657,11 @@ static void pci_release_host_bridge_dev(struct device *dev) pci_free_resource_list(&bridge->windows); pci_free_resource_list(&bridge->dma_ranges); + + /* Host bridges only have domain_nr set in the emulation case */ + if (bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_emul_domain_nr(bridge->domain_nr); + kfree(bridge); } @@ -1137,7 +1142,8 @@ unregister: device_del(&bridge->dev); free: #ifdef CONFIG_PCI_DOMAINS_GENERIC - pci_bus_release_domain_nr(parent, bus->domain_nr); + if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET) + pci_bus_release_domain_nr(parent, bus->domain_nr); #endif if (bus_registered) put_device(&bus->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..1ef1535802b0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1956,10 +1956,17 @@ DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) */ #ifdef CONFIG_PCI_DOMAINS extern int pci_domains_supported; +int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max); +void pci_bus_release_emul_domain_nr(int domain_nr); #else enum { pci_domains_supported = 0 }; static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static inline int pci_proc_domain(struct pci_bus *bus) { return 0; } +static inline int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max) +{ + return 0; +} +static inline void pci_bus_release_emul_domain_nr(int domain_nr) { } #endif /* CONFIG_PCI_DOMAINS */ /* -- cgit v1.2.3 From 32e0f607ac6a2bb5d144540897535fd01be77586 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:24 -0400 Subject: tracing: Add trace_seq_pop() and seq_buf_pop() In order to allow an interface to remove an added character from the trace_seq and seq_buf descriptors, add helper functions trace_seq_pop() and seq_buf_pop(). Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231148.594898736@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 17 +++++++++++++++++ include/linux/trace_seq.h | 13 +++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 52791e070506..9f2839e73f8a 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -149,6 +149,23 @@ static inline void seq_buf_commit(struct seq_buf *s, int num) } } +/** + * seq_buf_pop - pop off the last written character + * @s: the seq_buf handle + * + * Removes the last written character to the seq_buf @s. + * + * Returns the last character or -1 if it is empty. + */ +static inline int seq_buf_pop(struct seq_buf *s) +{ + if (!s->len) + return -1; + + s->len--; + return (unsigned int)s->buffer[s->len]; +} + extern __printf(2, 3) int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); extern __printf(2, 0) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 557780fe1c77..4a0b8c172d27 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -80,6 +80,19 @@ static inline bool trace_seq_has_overflowed(struct trace_seq *s) return s->full || seq_buf_has_overflowed(&s->seq); } +/** + * trace_seq_pop - pop off the last written character + * @s: trace sequence descriptor + * + * Removes the last written character to the trace_seq @s. + * + * Returns the last character or -1 if it is empty. + */ +static inline int trace_seq_pop(struct trace_seq *s) +{ + return seq_buf_pop(&s->seq); +} + /* * Currently only defined when tracing is enabled. */ -- cgit v1.2.3 From f74ee32963f1b74865fe679e2475450434fea51c Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Tue, 28 Oct 2025 20:09:00 +0800 Subject: tools/dma: move dma_map_benchmark from selftests to tools/dma dma_map_benchmark is a standalone developer tool rather than an automated selftest. It has no pass/fail criteria, expects manual invocation, and is built as a normal userspace binary. Move it to tools/dma/ and add a minimal Makefile. Suggested-by: Marek Szyprowski Suggested-by: Barry Song Signed-off-by: Qinxin Xia Acked-by: Barry Song Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251028120900.2265511-3-xiaqinxin@huawei.com --- include/linux/map_benchmark.h | 32 ------ include/uapi/linux/map_benchmark.h | 35 +++++++ kernel/dma/map_benchmark.c | 2 +- tools/Makefile | 13 +-- tools/dma/.gitignore | 3 + tools/dma/Makefile | 55 ++++++++++ tools/dma/config | 1 + tools/dma/dma_map_benchmark.c | 127 +++++++++++++++++++++++ tools/testing/selftests/dma/Makefile | 7 -- tools/testing/selftests/dma/config | 1 - tools/testing/selftests/dma/dma_map_benchmark.c | 128 ------------------------ 11 files changed, 229 insertions(+), 175 deletions(-) delete mode 100644 include/linux/map_benchmark.h create mode 100644 include/uapi/linux/map_benchmark.h create mode 100644 tools/dma/.gitignore create mode 100644 tools/dma/Makefile create mode 100644 tools/dma/config create mode 100644 tools/dma/dma_map_benchmark.c delete mode 100644 tools/testing/selftests/dma/Makefile delete mode 100644 tools/testing/selftests/dma/config delete mode 100644 tools/testing/selftests/dma/dma_map_benchmark.c (limited to 'include/linux') diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h deleted file mode 100644 index 48e2ff95332f..000000000000 --- a/include/linux/map_benchmark.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2022 HiSilicon Limited. - */ - -#ifndef _KERNEL_DMA_BENCHMARK_H -#define _KERNEL_DMA_BENCHMARK_H - -#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) -#define DMA_MAP_MAX_THREADS 1024 -#define DMA_MAP_MAX_SECONDS 300 -#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) - -#define DMA_MAP_BIDIRECTIONAL 0 -#define DMA_MAP_TO_DEVICE 1 -#define DMA_MAP_FROM_DEVICE 2 - -struct map_benchmark { - __u64 avg_map_100ns; /* average map latency in 100ns */ - __u64 map_stddev; /* standard deviation of map latency */ - __u64 avg_unmap_100ns; /* as above */ - __u64 unmap_stddev; - __u32 threads; /* how many threads will do map/unmap in parallel */ - __u32 seconds; /* how long the test will last */ - __s32 node; /* which numa node this benchmark will run on */ - __u32 dma_bits; /* DMA addressing capability */ - __u32 dma_dir; /* DMA data direction */ - __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ - __u8 expansion[76]; /* For future use */ -}; -#endif /* _KERNEL_DMA_BENCHMARK_H */ diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h new file mode 100644 index 000000000000..c2d91088a40d --- /dev/null +++ b/include/uapi/linux/map_benchmark.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2025 HiSilicon Limited. + */ + +#ifndef _UAPI_DMA_BENCHMARK_H +#define _UAPI_DMA_BENCHMARK_H + +#include + +#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) +#define DMA_MAP_MAX_THREADS 1024 +#define DMA_MAP_MAX_SECONDS 300 +#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) + +#define DMA_MAP_BIDIRECTIONAL 0 +#define DMA_MAP_TO_DEVICE 1 +#define DMA_MAP_FROM_DEVICE 2 + +struct map_benchmark { + __u64 avg_map_100ns; /* average map latency in 100ns */ + __u64 map_stddev; /* standard deviation of map latency */ + __u64 avg_unmap_100ns; /* as above */ + __u64 unmap_stddev; + __u32 threads; /* how many threads will do map/unmap in parallel */ + __u32 seconds; /* how long the test will last */ + __s32 node; /* which numa node this benchmark will run on */ + __u32 dma_bits; /* DMA addressing capability */ + __u32 dma_dir; /* DMA data direction */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ +}; + +#endif /* _UAPI_DMA_BENCHMARK_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index cc19a3efea89..794041a39e65 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,13 +11,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include struct map_benchmark_data { struct map_benchmark bparam; diff --git a/tools/Makefile b/tools/Makefile index c31cbbd12c45..cb40961a740f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -14,6 +14,7 @@ help: @echo ' counter - counter tools' @echo ' cpupower - a tool for all things x86 CPU power' @echo ' debugging - tools for debugging' + @echo ' dma - tools for DMA mapping' @echo ' firewire - the userspace part of nosy, an IEEE-1394 traffic sniffer' @echo ' firmware - Firmware tools' @echo ' freefall - laptop accelerometer program for disk protection' @@ -69,7 +70,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE +counter dma firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -122,7 +123,7 @@ kvm_stat: FORCE ynl: FORCE $(call descend,net/ynl) -all: acpi counter cpupower gpio hv firewire \ +all: acpi counter cpupower dma gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ @@ -134,7 +135,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install: +counter_install dma_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -164,7 +165,7 @@ kvm_stat_install: ynl_install: $(call descend,net/$(@:_install=),install) -install: acpi_install counter_install cpupower_install gpio_install \ +install: acpi_install counter_install cpupower_install dma_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install mm_install bpf_install x86_energy_perf_policy_install \ @@ -178,7 +179,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean: +counter_clean dma_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -224,7 +225,7 @@ build_clean: ynl_clean: $(call descend,net/$(@:_clean=),clean) -clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ +clean: acpi_clean counter_clean cpupower_clean dma_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ diff --git a/tools/dma/.gitignore b/tools/dma/.gitignore new file mode 100644 index 000000000000..94b68cf4147b --- /dev/null +++ b/tools/dma/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +dma_map_benchmark +include/linux/map_benchmark.h diff --git a/tools/dma/Makefile b/tools/dma/Makefile new file mode 100644 index 000000000000..e4abf37bf020 --- /dev/null +++ b/tools/dma/Makefile @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +include ../scripts/Makefile.include + +bindir ?= /usr/bin + +# This will work when dma is built in tools env. where srctree +# isn't set and when invoked from selftests build, where srctree +# is set to ".". building_out_of_srctree is undefined for in srctree +# builds +ifndef building_out_of_srctree +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := dma_map_benchmark +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) + +all: $(ALL_PROGRAMS) + +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +# +# We need the following to be outside of kernel tree +# +$(OUTPUT)include/linux/map_benchmark.h: ../../include/uapi/linux/map_benchmark.h + mkdir -p $(OUTPUT)include/linux 2>&1 || true + ln -sf $(CURDIR)/../../include/uapi/linux/map_benchmark.h $@ + +prepare: $(OUTPUT)include/linux/map_benchmark.h + +FORCE: + +DMA_MAP_BENCHMARK = dma_map_benchmark +$(DMA_MAP_BENCHMARK): prepare FORCE + $(CC) $(CFLAGS) $(DMA_MAP_BENCHMARK).c -o $(DMA_MAP_BENCHMARK) + +clean: + rm -f $(ALL_PROGRAMS) + rm -rf $(OUTPUT)include + find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -o -name '\.*.cmd' -delete + +install: $(ALL_PROGRAMS) + install -d -m 755 $(DESTDIR)$(bindir); \ + for program in $(ALL_PROGRAMS); do \ + install $$program $(DESTDIR)$(bindir); \ + done + +.PHONY: all install clean prepare FORCE diff --git a/tools/dma/config b/tools/dma/config new file mode 100644 index 000000000000..6102ee3c43cd --- /dev/null +++ b/tools/dma/config @@ -0,0 +1 @@ +CONFIG_DMA_MAP_BENCHMARK=y diff --git a/tools/dma/dma_map_benchmark.c b/tools/dma/dma_map_benchmark.c new file mode 100644 index 000000000000..5474a450863c --- /dev/null +++ b/tools/dma/dma_map_benchmark.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 HiSilicon Limited. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NSEC_PER_MSEC 1000000L + +static char *directions[] = { + "BIDIRECTIONAL", + "TO_DEVICE", + "FROM_DEVICE", +}; + +int main(int argc, char **argv) +{ + struct map_benchmark map; + int fd, opt; + /* default single thread, run 20 seconds on NUMA_NO_NODE */ + int threads = 1, seconds = 20, node = -1; + /* default dma mask 32bit, bidirectional DMA */ + int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; + /* default granule 1 PAGESIZE */ + int granule = 1; + + int cmd = DMA_MAP_BENCHMARK; + + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { + switch (opt) { + case 't': + threads = atoi(optarg); + break; + case 's': + seconds = atoi(optarg); + break; + case 'n': + node = atoi(optarg); + break; + case 'b': + bits = atoi(optarg); + break; + case 'd': + dir = atoi(optarg); + break; + case 'x': + xdelay = atoi(optarg); + break; + case 'g': + granule = atoi(optarg); + break; + default: + return -1; + } + } + + if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) { + fprintf(stderr, "invalid number of threads, must be in 1-%d\n", + DMA_MAP_MAX_THREADS); + exit(1); + } + + if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) { + fprintf(stderr, "invalid number of seconds, must be in 1-%d\n", + DMA_MAP_MAX_SECONDS); + exit(1); + } + + if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) { + fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n", + DMA_MAP_MAX_TRANS_DELAY); + exit(1); + } + + /* suppose the mininum DMA zone is 1MB in the world */ + if (bits < 20 || bits > 64) { + fprintf(stderr, "invalid dma mask bit, must be in 20-64\n"); + exit(1); + } + + if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE && + dir != DMA_MAP_FROM_DEVICE) { + fprintf(stderr, "invalid dma direction\n"); + exit(1); + } + + if (granule < 1 || granule > 1024) { + fprintf(stderr, "invalid granule size\n"); + exit(1); + } + + fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); + if (fd == -1) { + perror("open"); + exit(1); + } + + memset(&map, 0, sizeof(map)); + map.seconds = seconds; + map.threads = threads; + map.node = node; + map.dma_bits = bits; + map.dma_dir = dir; + map.dma_trans_ns = xdelay; + map.granule = granule; + + if (ioctl(fd, cmd, &map)) { + perror("ioctl"); + exit(1); + } + + printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", + threads, seconds, node, dir[directions], granule); + printf("average map latency(us):%.1f standard deviation:%.1f\n", + map.avg_map_100ns/10.0, map.map_stddev/10.0); + printf("average unmap latency(us):%.1f standard deviation:%.1f\n", + map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0); + + return 0; +} diff --git a/tools/testing/selftests/dma/Makefile b/tools/testing/selftests/dma/Makefile deleted file mode 100644 index cd8c5ece1cba..000000000000 --- a/tools/testing/selftests/dma/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -CFLAGS += -I../../../../usr/include/ -CFLAGS += -I../../../../include/ - -TEST_GEN_PROGS := dma_map_benchmark - -include ../lib.mk diff --git a/tools/testing/selftests/dma/config b/tools/testing/selftests/dma/config deleted file mode 100644 index 6102ee3c43cd..000000000000 --- a/tools/testing/selftests/dma/config +++ /dev/null @@ -1 +0,0 @@ -CONFIG_DMA_MAP_BENCHMARK=y diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c deleted file mode 100644 index b12f1f9babf8..000000000000 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020 HiSilicon Limited. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NSEC_PER_MSEC 1000000L - -static char *directions[] = { - "BIDIRECTIONAL", - "TO_DEVICE", - "FROM_DEVICE", -}; - -int main(int argc, char **argv) -{ - struct map_benchmark map; - int fd, opt; - /* default single thread, run 20 seconds on NUMA_NO_NODE */ - int threads = 1, seconds = 20, node = -1; - /* default dma mask 32bit, bidirectional DMA */ - int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; - /* default granule 1 PAGESIZE */ - int granule = 1; - - int cmd = DMA_MAP_BENCHMARK; - - while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { - switch (opt) { - case 't': - threads = atoi(optarg); - break; - case 's': - seconds = atoi(optarg); - break; - case 'n': - node = atoi(optarg); - break; - case 'b': - bits = atoi(optarg); - break; - case 'd': - dir = atoi(optarg); - break; - case 'x': - xdelay = atoi(optarg); - break; - case 'g': - granule = atoi(optarg); - break; - default: - return -1; - } - } - - if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) { - fprintf(stderr, "invalid number of threads, must be in 1-%d\n", - DMA_MAP_MAX_THREADS); - exit(1); - } - - if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) { - fprintf(stderr, "invalid number of seconds, must be in 1-%d\n", - DMA_MAP_MAX_SECONDS); - exit(1); - } - - if (xdelay < 0 || xdelay > DMA_MAP_MAX_TRANS_DELAY) { - fprintf(stderr, "invalid transmit delay, must be in 0-%ld\n", - DMA_MAP_MAX_TRANS_DELAY); - exit(1); - } - - /* suppose the mininum DMA zone is 1MB in the world */ - if (bits < 20 || bits > 64) { - fprintf(stderr, "invalid dma mask bit, must be in 20-64\n"); - exit(1); - } - - if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE && - dir != DMA_MAP_FROM_DEVICE) { - fprintf(stderr, "invalid dma direction\n"); - exit(1); - } - - if (granule < 1 || granule > 1024) { - fprintf(stderr, "invalid granule size\n"); - exit(1); - } - - fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); - if (fd == -1) { - perror("open"); - exit(1); - } - - memset(&map, 0, sizeof(map)); - map.seconds = seconds; - map.threads = threads; - map.node = node; - map.dma_bits = bits; - map.dma_dir = dir; - map.dma_trans_ns = xdelay; - map.granule = granule; - - if (ioctl(fd, cmd, &map)) { - perror("ioctl"); - exit(1); - } - - printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", - threads, seconds, node, dir[directions], granule); - printf("average map latency(us):%.1f standard deviation:%.1f\n", - map.avg_map_100ns/10.0, map.map_stddev/10.0); - printf("average unmap latency(us):%.1f standard deviation:%.1f\n", - map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0); - - return 0; -} -- cgit v1.2.3 From ed7fc3cbb38ffdca7a189e15982ce96acab4684c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:12:47 +0300 Subject: dma-mapping: prepare dma_map_ops to conversion to physical address Add new .map_phys() and .unmap_phys() callbacks to dma_map_ops as a preparation to replace .map_page() and .unmap_page() respectively. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-1-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 7 +++++++ kernel/dma/mapping.c | 4 ++++ kernel/dma/ops_helpers.c | 12 ++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 10882d00cb17..79d2a74d4b49 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -37,6 +37,13 @@ struct dma_map_ops { void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); + + dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*unmap_phys)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); /* * map_sg should return a negative error code on error. See * dma_map_sgtable() for a list of appropriate error codes diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index fe7472f13b10..4080aebe5deb 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -169,6 +169,8 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); + else if (ops->map_phys) + addr = ops->map_phys(dev, phys, size, dir, attrs); else if (is_mmio) { if (!ops->map_resource) return DMA_MAPPING_ERROR; @@ -223,6 +225,8 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); + else if (ops->unmap_phys) + ops->unmap_phys(dev, addr, size, dir, attrs); else if (is_mmio) { if (ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 6f9d604d9d40..1eccbdbc99c1 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -64,6 +64,7 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); struct page *page; + phys_addr_t phys; page = dma_alloc_contiguous(dev, size, gfp); if (!page) @@ -71,9 +72,13 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + phys = page_to_phys(page); if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, - dir, DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + else if (ops->map_phys) + *dma_handle = ops->map_phys(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); else *dma_handle = ops->map_page(dev, page, 0, size, dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -94,6 +99,9 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); + else if (ops->unmap_phys) + ops->unmap_phys(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); -- cgit v1.2.3 From 14cb413af00c5d3950d1a339dd2b6f01ce313fce Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:12:52 +0300 Subject: dma-mapping: remove unused mapping resource callbacks After ARM and XEN conversions to use physical addresses for the mapping, there are no in-kernel users for map_resource/unmap_resource callbacks, so remove them. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-6-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 6 ------ kernel/dma/mapping.c | 16 ++++------------ 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 79d2a74d4b49..2e98ecc313a3 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -53,12 +53,6 @@ struct dma_map_ops { enum dma_data_direction dir, unsigned long attrs); void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); - dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct device *dev, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 4080aebe5deb..32a85bfdf873 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,7 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; - dma_addr_t addr; + dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -171,18 +171,13 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (ops->map_phys) addr = ops->map_phys(dev, phys, size, dir, attrs); - else if (is_mmio) { - if (!ops->map_resource) - return DMA_MAPPING_ERROR; - - addr = ops->map_resource(dev, phys, size, dir, attrs); - } else { + else if (!is_mmio && ops->map_page) { struct page *page = phys_to_page(phys); size_t offset = offset_in_page(phys); /* * The dma_ops API contract for ops->map_page() requires - * kmappable memory, while ops->map_resource() does not. + * kmappable memory. */ addr = ops->map_page(dev, page, offset, size, dir, attrs); } @@ -227,10 +222,7 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (ops->unmap_phys) ops->unmap_phys(dev, addr, size, dir, attrs); - else if (is_mmio) { - if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - } else + else ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); -- cgit v1.2.3 From 131971f67e258170c678fe572fda95f8cef88e66 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:13:00 +0300 Subject: dma-mapping: remove unused map_page callback After conversion of arch code to use physical address mapping, there are no users of .map_page() and .unmap_page() callbacks, so let's remove them. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-14-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 7 ------- kernel/dma/mapping.c | 12 ------------ kernel/dma/ops_helpers.c | 8 +------- 3 files changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2e98ecc313a3..4809204c674c 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -31,13 +31,6 @@ struct dma_map_ops { void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs); - void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 32a85bfdf873..37163eb49f9f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -171,16 +171,6 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (ops->map_phys) addr = ops->map_phys(dev, phys, size, dir, attrs); - else if (!is_mmio && ops->map_page) { - struct page *page = phys_to_page(phys); - size_t offset = offset_in_page(phys); - - /* - * The dma_ops API contract for ops->map_page() requires - * kmappable memory. - */ - addr = ops->map_page(dev, page, offset, size, dir, attrs); - } if (!is_mmio) kmsan_handle_dma(phys, size, dir); @@ -222,8 +212,6 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (ops->unmap_phys) ops->unmap_phys(dev, addr, size, dir, attrs); - else - ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 1eccbdbc99c1..20caf9cabf69 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -76,11 +76,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (use_dma_iommu(dev)) *dma_handle = iommu_dma_map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->map_phys) - *dma_handle = ops->map_phys(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); else - *dma_handle = ops->map_page(dev, page, 0, size, dir, + *dma_handle = ops->map_phys(dev, phys, size, dir, DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); @@ -102,8 +99,5 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, else if (ops->unmap_phys) ops->unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); - else if (ops->unmap_page) - ops->unmap_page(dev, dma_handle, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); } -- cgit v1.2.3 From 00b3e8480be7a49203594bd1fdb4fd46f3b69d59 Mon Sep 17 00:00:00 2001 From: Izhar Ameer Shaikh Date: Tue, 21 Oct 2025 17:00:01 +0530 Subject: scsi: firmware: xilinx: Add support for secure read/write ioctl interface Add support for a generic ioctl read/write interface using which users can request firmware to perform read/write operations on a protected and secure address space. The functionality is introduced through the means of two new IOCTL IDs which extend the existing PM_IOCTL EEMI API: - IOCTL_READ_REG - IOCTL_MASK_WRITE_REG The caller only passes the node id of the given device and an offset. The base address is not exposed to the caller and internally retrieved by the firmware. Firmware will enforce an access policy on the incoming read/write request. Signed-off-by: Izhar Ameer Shaikh Reviewed-by: Tanmay Shah Signed-off-by: Radhey Shyam Pandey Signed-off-by: Ajay Neeli Acked-by: Senthil Nathan Thangaraj Acked-by: Michal Simek Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-3-ajay.neeli@amd.com Signed-off-by: Martin K. Petersen --- drivers/firmware/xilinx/zynqmp.c | 46 ++++++++++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 15 ++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 02da3e48bc8f..b7cd0eca9eaa 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -1616,6 +1616,52 @@ int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, return zynqmp_pm_invoke_fn(PM_IOCTL, payload, 3, 0, IOCTL_GET_FEATURE_CONFIG, id); } +/** + * zynqmp_pm_sec_read_reg - PM call to securely read from given offset + * of the node + * @node_id: Node Id of the device + * @offset: Offset to be used (20-bit) + * @ret_value: Output data read from the given offset after + * firmware access policy is successfully enforced + * + * Return: Returns 0 on success or error value on failure + */ +int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value) +{ + u32 ret_payload[PAYLOAD_ARG_CNT]; + u32 count = 1; + int ret; + + if (!ret_value) + return -EINVAL; + + ret = zynqmp_pm_invoke_fn(PM_IOCTL, ret_payload, 4, node_id, IOCTL_READ_REG, + offset, count); + + *ret_value = ret_payload[1]; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_sec_read_reg); + +/** + * zynqmp_pm_sec_mask_write_reg - PM call to securely write to given offset + * of the node + * @node_id: Node Id of the device + * @offset: Offset to be used (20-bit) + * @mask: Mask to be used + * @value: Value to be written + * + * Return: Returns 0 on success or error value on failure + */ +int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, u32 mask, + u32 value) +{ + return zynqmp_pm_invoke_fn(PM_IOCTL, NULL, 5, node_id, IOCTL_MASK_WRITE_REG, + offset, mask, value); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_sec_mask_write_reg); + /** * zynqmp_pm_set_sd_config - PM call to set value of SD config registers * @node: SD node ID diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..b161f37de5cc 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -241,6 +241,7 @@ enum pm_ioctl_id { IOCTL_GET_FEATURE_CONFIG = 27, /* IOCTL for Secure Read/Write Interface */ IOCTL_READ_REG = 28, + IOCTL_MASK_WRITE_REG = 29, /* Dynamic SD/GEM configuration */ IOCTL_SET_SD_CONFIG = 30, IOCTL_SET_GEM_CONFIG = 31, @@ -619,6 +620,9 @@ int zynqmp_pm_feature(const u32 api_id); int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); +int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value); +int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); int zynqmp_pm_force_pwrdwn(const u32 target, const enum zynqmp_pm_request_ack ack); @@ -916,6 +920,17 @@ static inline int zynqmp_pm_request_wake(const u32 node, return -ENODEV; } +static inline int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value) +{ + return -ENODEV; +} + static inline int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode) { return -ENODEV; -- cgit v1.2.3 From 0e4d26f79a74bc633846a27a9a20d52217c108dc Mon Sep 17 00:00:00 2001 From: Ajay Neeli Date: Tue, 21 Oct 2025 17:00:02 +0530 Subject: scsi: firmware: xilinx: Add APIs for UFS PHY initialization - Add APIs for UFS PHY initialization. - Verify M-PHY TX-RX configuration readiness. - Confirm SRAM initialization and Set SRAM bypass. - Retrieve UFS calibration values. Signed-off-by: Ajay Neeli Acked-by: Senthil Nathan Thangaraj Acked-by: Michal Simek Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-4-ajay.neeli@amd.com Signed-off-by: Martin K. Petersen --- drivers/firmware/xilinx/Makefile | 2 +- drivers/firmware/xilinx/zynqmp-ufs.c | 118 +++++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp-ufs.h | 38 ++++++++++ include/linux/firmware/xlnx-zynqmp.h | 1 + 4 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 drivers/firmware/xilinx/zynqmp-ufs.c create mode 100644 include/linux/firmware/xlnx-zynqmp-ufs.h (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/Makefile b/drivers/firmware/xilinx/Makefile index 875a53703c82..70f8f02f14a3 100644 --- a/drivers/firmware/xilinx/Makefile +++ b/drivers/firmware/xilinx/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Xilinx firmwares -obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o +obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o zynqmp-ufs.o obj-$(CONFIG_ZYNQMP_FIRMWARE_DEBUG) += zynqmp-debug.o diff --git a/drivers/firmware/xilinx/zynqmp-ufs.c b/drivers/firmware/xilinx/zynqmp-ufs.c new file mode 100644 index 000000000000..85da8a822f3a --- /dev/null +++ b/drivers/firmware/xilinx/zynqmp-ufs.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Firmware Layer for UFS APIs + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include +#include + +/* Register Node IDs */ +#define PM_REGNODE_PMC_IOU_SLCR 0x30000002 /* PMC IOU SLCR */ +#define PM_REGNODE_EFUSE_CACHE 0x30000003 /* EFUSE Cache */ + +/* Register Offsets for PMC IOU SLCR */ +#define SRAM_CSR_OFFSET 0x104C /* SRAM Control and Status */ +#define TXRX_CFGRDY_OFFSET 0x1054 /* M-PHY TX-RX Config ready */ + +/* Masks for SRAM Control and Status Register */ +#define SRAM_CSR_INIT_DONE_MASK BIT(0) /* SRAM initialization done */ +#define SRAM_CSR_EXT_LD_DONE_MASK BIT(1) /* SRAM External load done */ +#define SRAM_CSR_BYPASS_MASK BIT(2) /* Bypass SRAM interface */ + +/* Mask to check M-PHY TX-RX configuration readiness */ +#define TX_RX_CFG_RDY_MASK GENMASK(3, 0) + +/* Register Offsets for EFUSE Cache */ +#define UFS_CAL_1_OFFSET 0xBE8 /* UFS Calibration Value */ + +/** + * zynqmp_pm_is_mphy_tx_rx_config_ready - check M-PHY TX-RX config readiness + * @is_ready: Store output status (true/false) + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready) +{ + u32 regval; + int ret; + + if (!is_ready) + return -EINVAL; + + ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, TXRX_CFGRDY_OFFSET, ®val); + if (ret) + return ret; + + regval &= TX_RX_CFG_RDY_MASK; + if (regval) + *is_ready = true; + else + *is_ready = false; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_is_mphy_tx_rx_config_ready); + +/** + * zynqmp_pm_is_sram_init_done - check SRAM initialization + * @is_done: Store output status (true/false) + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_is_sram_init_done(bool *is_done) +{ + u32 regval; + int ret; + + if (!is_done) + return -EINVAL; + + ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, ®val); + if (ret) + return ret; + + regval &= SRAM_CSR_INIT_DONE_MASK; + if (regval) + *is_done = true; + else + *is_done = false; + + return ret; +} +EXPORT_SYMBOL_GPL(zynqmp_pm_is_sram_init_done); + +/** + * zynqmp_pm_set_sram_bypass - Set SRAM bypass Control + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_set_sram_bypass(void) +{ + u32 sram_csr; + int ret; + + ret = zynqmp_pm_sec_read_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, &sram_csr); + if (ret) + return ret; + + sram_csr &= ~SRAM_CSR_EXT_LD_DONE_MASK; + sram_csr |= SRAM_CSR_BYPASS_MASK; + + return zynqmp_pm_sec_mask_write_reg(PM_REGNODE_PMC_IOU_SLCR, SRAM_CSR_OFFSET, + GENMASK(2, 1), sram_csr); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_set_sram_bypass); + +/** + * zynqmp_pm_get_ufs_calibration_values - Read UFS calibration values + * @val: Store the calibration value + * + * Return: Returns 0 on success or error value on failure. + */ +int zynqmp_pm_get_ufs_calibration_values(u32 *val) +{ + return zynqmp_pm_sec_read_reg(PM_REGNODE_EFUSE_CACHE, UFS_CAL_1_OFFSET, val); +} +EXPORT_SYMBOL_GPL(zynqmp_pm_get_ufs_calibration_values); diff --git a/include/linux/firmware/xlnx-zynqmp-ufs.h b/include/linux/firmware/xlnx-zynqmp-ufs.h new file mode 100644 index 000000000000..d3538dd5822a --- /dev/null +++ b/include/linux/firmware/xlnx-zynqmp-ufs.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Firmware layer for UFS APIs. + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef __FIRMWARE_XLNX_ZYNQMP_UFS_H__ +#define __FIRMWARE_XLNX_ZYNQMP_UFS_H__ + +#if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) +int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready); +int zynqmp_pm_is_sram_init_done(bool *is_done); +int zynqmp_pm_set_sram_bypass(void); +int zynqmp_pm_get_ufs_calibration_values(u32 *val); +#else +static inline int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_is_sram_init_done(bool *is_done) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_sram_bypass(void) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_get_ufs_calibration_values(u32 *val) +{ + return -ENODEV; +} +#endif + +#endif /* __FIRMWARE_XLNX_ZYNQMP_UFS_H__ */ diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index b161f37de5cc..784d5920b4cd 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -16,6 +16,7 @@ #include #include +#include #define ZYNQMP_PM_VERSION_MAJOR 1 #define ZYNQMP_PM_VERSION_MINOR 0 -- cgit v1.2.3 From 74a7b4f18396f07e87c7fda5c19d1fcfb8c1dd44 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:08:02 -0700 Subject: sysctl: fix kernel-doc format warning Describe the "type" struct member using '@type' and move it together with the rest of the doc for ctl_table_header to avoid a kernel-doc warning: Warning: include/linux/sysctl.h:178 Incorrect use of kernel-doc format: * enum type - Enumeration to differentiate between ctl target types Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration") Signed-off-by: Randy Dunlap Signed-off-by: Joel Granados --- include/linux/sysctl.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 92e9146b1104..28c4a997fd21 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -156,6 +156,10 @@ struct ctl_node { * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered. * @rcu: Delays the freeing of the inode. Introduced with "unfuck proc_sysctl ->d_compare()" * + * @type: Enumeration to differentiate between ctl target types + * @type.SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations + * @type.SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Identifies a permanently empty dir + * target to serve as a mount point */ struct ctl_table_header { union { @@ -175,13 +179,6 @@ struct ctl_table_header { struct ctl_dir *parent; struct ctl_node *node; struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ - /** - * enum type - Enumeration to differentiate between ctl target types - * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations - * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently - * empty directory target to serve - * as mount point. - */ enum { SYSCTL_TABLE_TYPE_DEFAULT, SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY, -- cgit v1.2.3 From 0de4c70d04a46a3c266547dd4275ce25f623796a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: tracing: fprobe: use rhltable for fprobe_ip_table For now, all the kernel functions who are hooked by the fprobe will be added to the hash table "fprobe_ip_table". The key of it is the function address, and the value of it is "struct fprobe_hlist_node". The budget of the hash table is FPROBE_IP_TABLE_SIZE, which is 256. And this means the overhead of the hash table lookup will grow linearly if the count of the functions in the fprobe more than 256. When we try to hook all the kernel functions, the overhead will be huge. Therefore, replace the hash table with rhltable to reduce the overhead. Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 3 +- kernel/trace/fprobe.c | 157 ++++++++++++++++++++++++++++--------------------- 2 files changed, 93 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 7964db96e41a..0a3bcd1718f3 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct fprobe; @@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, * @fp: The fprobe which owns this. */ struct fprobe_hlist_node { - struct hlist_node hlist; + struct rhlist_head hlist; unsigned long addr; struct fprobe *fp; }; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 5a807d62e76d..e063e22e1134 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -41,60 +42,67 @@ * - RCU hlist traversal under disabling preempt */ static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; -static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static struct rhltable fprobe_ip_table; static DEFINE_MUTEX(fprobe_mutex); -/* - * Find first fprobe in the hlist. It will be iterated twice in the entry - * probe, once for correcting the total required size, the second time is - * calling back the user handlers. - * Thus the hlist in the fprobe_table must be sorted and new probe needs to - * be added *before* the first fprobe. - */ -static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +static u32 fprobe_node_hashfn(const void *data, u32 len, u32 seed) { - struct fprobe_hlist_node *node; - struct hlist_head *head; + return hash_ptr(*(unsigned long **)data, 32); +} - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (node->addr == ip) - return node; - } - return NULL; +static int fprobe_node_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + unsigned long key = *(unsigned long *)arg->key; + const struct fprobe_hlist_node *n = ptr; + + return n->addr != key; } -NOKPROBE_SYMBOL(find_first_fprobe_node); -/* Node insertion and deletion requires the fprobe_mutex */ -static void insert_fprobe_node(struct fprobe_hlist_node *node) +static u32 fprobe_node_obj_hashfn(const void *data, u32 len, u32 seed) { - unsigned long ip = node->addr; - struct fprobe_hlist_node *next; - struct hlist_head *head; + const struct fprobe_hlist_node *n = data; + + return hash_ptr((void *)n->addr, 32); +} +static const struct rhashtable_params fprobe_rht_params = { + .head_offset = offsetof(struct fprobe_hlist_node, hlist), + .key_offset = offsetof(struct fprobe_hlist_node, addr), + .key_len = sizeof_field(struct fprobe_hlist_node, addr), + .hashfn = fprobe_node_hashfn, + .obj_hashfn = fprobe_node_obj_hashfn, + .obj_cmpfn = fprobe_node_cmp, + .automatic_shrinking = true, +}; + +/* Node insertion and deletion requires the fprobe_mutex */ +static int insert_fprobe_node(struct fprobe_hlist_node *node) +{ lockdep_assert_held(&fprobe_mutex); - next = find_first_fprobe_node(ip); - if (next) { - hlist_add_before_rcu(&node->hlist, &next->hlist); - return; - } - head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; - hlist_add_head_rcu(&node->hlist, head); + return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params); } /* Return true if there are synonims */ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); + bool ret; /* Avoid double deleting */ if (READ_ONCE(node->fp) != NULL) { WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + rhltable_remove(&fprobe_ip_table, &node->hlist, + fprobe_rht_params); } - return !!find_first_fprobe_node(node->addr); + + rcu_read_lock(); + ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr, + fprobe_rht_params); + rcu_read_unlock(); + + return ret; } /* Check existence of the fprobe */ @@ -249,9 +257,10 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) { - struct fprobe_hlist_node *node, *first; unsigned long *fgraph_data = NULL; unsigned long func = trace->func; + struct fprobe_hlist_node *node; + struct rhlist_head *head, *pos; unsigned long ret_ip; int reserved_words; struct fprobe *fp; @@ -260,14 +269,11 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, if (WARN_ON_ONCE(!fregs)) return 0; - first = node = find_first_fprobe_node(func); - if (unlikely(!first)) - return 0; - + head = rhltable_lookup(&fprobe_ip_table, &func, fprobe_rht_params); reserved_words = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || !fp->exit_handler) continue; @@ -278,13 +284,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, reserved_words += FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); } - node = first; if (reserved_words) { fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); if (unlikely(!fgraph_data)) { - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (fp && !fprobe_disabled(fp)) fp->nmissed++; @@ -299,12 +304,12 @@ static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, */ ret_ip = ftrace_regs_get_return_address(fregs); used = 0; - hlist_for_each_entry_from_rcu(node, hlist) { + rhl_for_each_entry_rcu(node, pos, head, hlist) { int data_size; void *data; if (node->addr != func) - break; + continue; fp = READ_ONCE(node->fp); if (!fp || fprobe_disabled(fp)) continue; @@ -449,25 +454,21 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad return 0; } -static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, - struct fprobe_addr_list *alist) +static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node, + struct fprobe_addr_list *alist) { - struct fprobe_hlist_node *node; int ret = 0; - hlist_for_each_entry_rcu(node, head, hlist, - lockdep_is_held(&fprobe_mutex)) { - if (!within_module(node->addr, mod)) - continue; - if (delete_fprobe_node(node)) - continue; - /* - * If failed to update alist, just continue to update hlist. - * Therefore, at list user handler will not hit anymore. - */ - if (!ret) - ret = fprobe_addr_list_add(alist, node->addr); - } + if (!within_module(node->addr, mod)) + return; + if (delete_fprobe_node(node)) + return; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); } /* Handle module unloading to manage fprobe_ip_table. */ @@ -475,8 +476,9 @@ static int fprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct fprobe_hlist_node *node; + struct rhashtable_iter iter; struct module *mod = data; - int i; if (val != MODULE_STATE_GOING) return NOTIFY_DONE; @@ -487,8 +489,16 @@ static int fprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; mutex_lock(&fprobe_mutex); - for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) - fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + rhltable_walk_enter(&fprobe_ip_table, &iter); + do { + rhashtable_walk_start(&iter); + + while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node)) + fprobe_remove_node_in_module(mod, node, &alist); + + rhashtable_walk_stop(&iter); + } while (node == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, @@ -728,8 +738,16 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) ret = fprobe_graph_add_ips(addrs, num); if (!ret) { add_fprobe_hash(fp); - for (i = 0; i < hlist_array->size; i++) - insert_fprobe_node(&hlist_array->array[i]); + for (i = 0; i < hlist_array->size; i++) { + ret = insert_fprobe_node(&hlist_array->array[i]); + if (ret) + break; + } + /* fallback on insert error */ + if (ret) { + for (i--; i >= 0; i--) + delete_fprobe_node(&hlist_array->array[i]); + } } mutex_unlock(&fprobe_mutex); @@ -825,3 +843,10 @@ out: return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); + +static int __init fprobe_initcall(void) +{ + rhltable_init(&fprobe_ip_table, &fprobe_rht_params); + return 0; +} +late_initcall(fprobe_initcall); -- cgit v1.2.3 From 652a86b24c5ac444afaf7625c9340d55aab7f105 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 31 Oct 2025 14:08:32 +0100 Subject: err.h: add INIT_ERR_PTR() macro Add INIT_ERR_PTR() macro to initialize static variables with error pointers. This might be useful for specific case where there is a static variable initialized to an error condition and then later set to the real handle once probe finish/completes. This is to handle compilation problems like: error: initializer element is not constant where ERR_PTR() can't be used. Signed-off-by: Christian Marangi Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20251031130835.7953-2-ansuelsmth@gmail.com [bjorn: Added () suffix on macro references] Signed-off-by: Bjorn Andersson --- include/linux/err.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index 1d60aa86db53..8c37be0620ab 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,14 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/** + * INIT_ERR_PTR - Init a const error pointer. + * @error: A negative error code. + * + * Like ERR_PTR(), but usable to initialize static variables. + */ +#define INIT_ERR_PTR(error) ((void *)(error)) + /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) -- cgit v1.2.3 From 8ed6b8842c44a4a716dfd536e7f13aff77039a02 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 25 Sep 2025 22:09:56 +0300 Subject: power: supply: max77705_charger: implement aicl feature Adaptive input current allows charger to reduce it's current consumption, when source is not able to provide enough power. Signed-off-by: Dzmitry Sankouski Link: https://patch.msgid.link/20250925-max77705_77976_charger_improvement-v6-1-972c716c17d1@gmail.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 42 +++++++++++++++++++++++++++++++++ include/linux/power/max77705_charger.h | 2 ++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index b1a227bf72e2..35cdb10a0e89 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -40,6 +40,39 @@ static enum power_supply_property max77705_charger_props[] = { POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, }; +static irqreturn_t max77705_aicl_irq(int irq, void *irq_drv_data) +{ + struct max77705_charger_data *chg = irq_drv_data; + unsigned int regval, irq_status; + int err; + + err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status); + if (err < 0) + return IRQ_HANDLED; + + // irq is fiered at the end of current decrease sequence too + // early check AICL_I bit to guard against that excess irq call + while (!(irq_status & BIT(MAX77705_AICL_I))) { + err = regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], ®val); + if (err < 0) + return IRQ_HANDLED; + + regval--; + + err = regmap_field_write(chg->rfield[MAX77705_CHG_CHGIN_LIM], regval); + if (err < 0) + return IRQ_HANDLED; + + msleep(AICL_WORK_DELAY_MS); + + err = regmap_read(chg->regmap, MAX77705_CHG_REG_INT_OK, &irq_status); + if (err < 0) + return IRQ_HANDLED; + } + + return IRQ_HANDLED; +} + static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) { struct max77705_charger_data *chg = irq_drv_data; @@ -632,6 +665,15 @@ static int max77705_charger_probe(struct i2c_client *i2c) goto destroy_wq; } + ret = devm_request_threaded_irq(dev, regmap_irq_get_virq(irq_data, MAX77705_AICL_I), + NULL, max77705_aicl_irq, + IRQF_TRIGGER_NONE, + "aicl-irq", chg); + if (ret) { + dev_err_probe(dev, ret, "Failed to Request aicl IRQ\n"); + goto destroy_wq; + } + ret = max77705_charger_enable(chg); if (ret) { dev_err_probe(dev, ret, "failed to enable charge\n"); diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index 6653abfdf747..b3950ce0625e 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -123,6 +123,8 @@ #define MAX77705_DISABLE_SKIP 1 #define MAX77705_AUTO_SKIP 0 +#define AICL_WORK_DELAY_MS 100 + /* uA */ #define MAX77705_CURRENT_CHGIN_STEP 25000 #define MAX77705_CURRENT_CHG_STEP 50000 -- cgit v1.2.3 From 695f2ca1b4247724576d57eae7b74b90dc69ba3c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 10 Oct 2025 16:36:16 -0500 Subject: fs/fs_parse: add back fsparam_u32hex 296b67059 removed fsparam_u32hex because there were no callers (yet) and it didn't build due to using the nonexistent symbol fs_param_is_u32_hex. fs/9p will need this parser, so add it back with the appropriate fix (use fs_param_is_u32). Signed-off-by: Eric Sandeen Message-ID: <20251010214222.1347785-2-sandeen@redhat.com> Signed-off-by: Dominique Martinet --- include/linux/fs_parser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5a0e897cae80..5e8a3b546033 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -120,6 +120,8 @@ static inline bool fs_validate_description(const char *name, #define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL) #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) +#define fsparam_u32hex(NAME, OPT) \ + __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) -- cgit v1.2.3 From 56d9df41ef1847ed0523f57ec6117649d581401d Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sat, 1 Nov 2025 01:45:04 +0100 Subject: rtc: ds1685: stop setting max_user_freq max_user_freq has not been related to the hardware RTC since commit 6610e0893b8b ("RTC: Rework RTC code to use timerqueue for events"). Stop setting it from individual driver to avoid confusing new contributors. Acked-by: Joshua Kinard Link: https://patch.msgid.link/20251101-max_user_freq-v1-2-c9a274fd6883@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1685.c | 3 --- include/linux/rtc/ds1685.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index 97423f1d0361..5fc8e36b1abf 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1268,9 +1268,6 @@ ds1685_rtc_probe(struct platform_device *pdev) rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc_dev->range_max = RTC_TIMESTAMP_END_2099; - /* Maximum periodic rate is 8192Hz (0.122070ms). */ - rtc_dev->max_user_freq = RTC_MAX_USER_FREQ; - /* See if the platform doesn't support UIE. */ if (pdata->uie_unsupported) clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc_dev->features); diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h index 01da4582db6d..8ec0ebfaef04 100644 --- a/include/linux/rtc/ds1685.h +++ b/include/linux/rtc/ds1685.h @@ -324,7 +324,6 @@ struct ds1685_rtc_platform_data { #define RTC_SQW_2HZ 0x0f /* 0 1 1 1 1 */ #define RTC_SQW_0HZ 0x00 /* 0 0 0 0 0 */ #define RTC_SQW_32768HZ 32768 /* 1 - - - - */ -#define RTC_MAX_USER_FREQ 8192 /* -- cgit v1.2.3 From 85d55d8cc3ef7f77b249c97e9fac6a0fc5f5daa7 Mon Sep 17 00:00:00 2001 From: Akhil P Oommen Date: Tue, 30 Sep 2025 11:18:06 +0530 Subject: soc: qcom: ubwc: Add config for Kaanapali Add the ubwc configuration for Kaanapali chipset. This chipset brings support for UBWC v6 version. The rest of the configurations remains as usual. Signed-off-by: Akhil P Oommen Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250930-kaana-gpu-support-v1-1-73530b0700ed@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/ubwc_config.c | 11 +++++++++++ include/linux/soc/qcom/ubwc.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/ubwc_config.c b/drivers/soc/qcom/ubwc_config.c index 942fe6c17612..1c09796163b0 100644 --- a/drivers/soc/qcom/ubwc_config.c +++ b/drivers/soc/qcom/ubwc_config.c @@ -16,6 +16,16 @@ static const struct qcom_ubwc_cfg_data no_ubwc_data = { /* no UBWC, no HBB */ }; +static const struct qcom_ubwc_cfg_data kaanapali_data = { + .ubwc_enc_version = UBWC_6_0, + .ubwc_dec_version = UBWC_6_0, + .ubwc_swizzle = UBWC_SWIZZLE_ENABLE_LVL2 | + UBWC_SWIZZLE_ENABLE_LVL3, + .ubwc_bank_spread = true, + .highest_bank_bit = 16, + .macrotile_mode = true, +}; + static const struct qcom_ubwc_cfg_data msm8937_data = { .ubwc_enc_version = UBWC_1_0, .ubwc_dec_version = UBWC_1_0, @@ -234,6 +244,7 @@ static const struct of_device_id qcom_ubwc_configs[] __maybe_unused = { { .compatible = "qcom,apq8026", .data = &no_ubwc_data }, { .compatible = "qcom,apq8074", .data = &no_ubwc_data }, { .compatible = "qcom,apq8096", .data = &msm8998_data }, + { .compatible = "qcom,kaanapali", .data = &kaanapali_data, }, { .compatible = "qcom,glymur", .data = &glymur_data}, { .compatible = "qcom,msm8226", .data = &no_ubwc_data }, { .compatible = "qcom,msm8916", .data = &no_ubwc_data }, diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h index 1ed8b1b16bc9..0a4edfe3d96d 100644 --- a/include/linux/soc/qcom/ubwc.h +++ b/include/linux/soc/qcom/ubwc.h @@ -52,6 +52,7 @@ struct qcom_ubwc_cfg_data { #define UBWC_4_0 0x40000000 #define UBWC_4_3 0x40030000 #define UBWC_5_0 0x50000000 +#define UBWC_6_0 0x60000000 #if IS_ENABLED(CONFIG_QCOM_UBWC_CONFIG) const struct qcom_ubwc_cfg_data *qcom_ubwc_config_get_data(void); -- cgit v1.2.3 From 603c646f001008eaf8b5a7a888043e5cc8c494a2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:53 -0700 Subject: coco/tsm: Introduce a core device for TEE Security Managers A "TSM" is a platform component that provides an API for securely provisioning resources for a confidential guest (TVM) to consume. The name originates from the PCI specification for platform agent that carries out operations for PCIe TDISP (TEE Device Interface Security Protocol). Instances of this core device are parented by a device representing the platform security function like CONFIG_CRYPTO_DEV_CCP or CONFIG_INTEL_TDX_HOST. This device interface is a frontend to the aspects of a TSM and TEE I/O that are cross-architecture common. This includes mechanisms like enumerating available platform TEE I/O capabilities and provisioning connections between the platform TSM and device DSMs (Device Security Manager (TDISP)). For now this is just the scaffolding for registering a TSM device sysfs interface. Cc: Xu Yilun Reviewed-by: Jonathan Cameron Co-developed-by: Aneesh Kumar K.V (Arm) Signed-off-by: Aneesh Kumar K.V (Arm) Acked-by: Bjorn Helgaas Reviewed-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251031212902.2256310-2-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-class-tsm | 9 +++ MAINTAINERS | 2 +- drivers/virt/coco/Kconfig | 3 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/tsm-core.c | 93 +++++++++++++++++++++++++++++++ include/linux/tsm.h | 11 ++++ 6 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/testing/sysfs-class-tsm create mode 100644 drivers/virt/coco/tsm-core.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-tsm b/Documentation/ABI/testing/sysfs-class-tsm new file mode 100644 index 000000000000..2949468deaf7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-tsm @@ -0,0 +1,9 @@ +What: /sys/class/tsm/tsmN +Contact: linux-coco@lists.linux.dev +Description: + "tsmN" is a device that represents the generic attributes of a + platform TEE Security Manager. It is typically a child of a + platform enumerated TSM device. /sys/class/tsm/tsmN/uevent + signals when the PCI layer is able to support establishment of + link encryption and other device-security features coordinated + through a platform tsm. diff --git a/MAINTAINERS b/MAINTAINERS index 46bd8e033042..b8c9929532ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26112,7 +26112,7 @@ M: David Lechner S: Maintained F: Documentation/devicetree/bindings/trigger-source/* -TRUSTED SECURITY MODULE (TSM) INFRASTRUCTURE +TRUSTED EXECUTION ENVIRONMENT SECURITY MANAGER (TSM) M: Dan Williams L: linux-coco@lists.linux.dev S: Maintained diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index 819a97e8ba99..bb0c6d6ddcc8 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -14,3 +14,6 @@ source "drivers/virt/coco/tdx-guest/Kconfig" source "drivers/virt/coco/arm-cca-guest/Kconfig" source "drivers/virt/coco/guest/Kconfig" + +config TSM + bool diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index f918bbb61737..cb52021912b3 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/ obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/ +obj-$(CONFIG_TSM) += tsm-core.o obj-$(CONFIG_TSM_GUEST) += guest/ diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c new file mode 100644 index 000000000000..347507cc5e3f --- /dev/null +++ b/drivers/virt/coco/tsm-core.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +static struct class *tsm_class; +static DECLARE_RWSEM(tsm_rwsem); +static DEFINE_IDA(tsm_ida); + +static struct tsm_dev *alloc_tsm_dev(struct device *parent) +{ + struct device *dev; + int id; + + struct tsm_dev *tsm_dev __free(kfree) = + kzalloc(sizeof(*tsm_dev), GFP_KERNEL); + if (!tsm_dev) + return ERR_PTR(-ENOMEM); + + id = ida_alloc(&tsm_ida, GFP_KERNEL); + if (id < 0) + return ERR_PTR(id); + + tsm_dev->id = id; + dev = &tsm_dev->dev; + dev->parent = parent; + dev->class = tsm_class; + device_initialize(dev); + + return no_free_ptr(tsm_dev); +} + +struct tsm_dev *tsm_register(struct device *parent) +{ + struct tsm_dev *tsm_dev __free(put_tsm_dev) = alloc_tsm_dev(parent); + struct device *dev; + int rc; + + if (IS_ERR(tsm_dev)) + return tsm_dev; + + dev = &tsm_dev->dev; + rc = dev_set_name(dev, "tsm%d", tsm_dev->id); + if (rc) + return ERR_PTR(rc); + + rc = device_add(dev); + if (rc) + return ERR_PTR(rc); + + return no_free_ptr(tsm_dev); +} +EXPORT_SYMBOL_GPL(tsm_register); + +void tsm_unregister(struct tsm_dev *tsm_dev) +{ + device_unregister(&tsm_dev->dev); +} +EXPORT_SYMBOL_GPL(tsm_unregister); + +static void tsm_release(struct device *dev) +{ + struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev); + + ida_free(&tsm_ida, tsm_dev->id); + kfree(tsm_dev); +} + +static int __init tsm_init(void) +{ + tsm_class = class_create("tsm"); + if (IS_ERR(tsm_class)) + return PTR_ERR(tsm_class); + + tsm_class->dev_release = tsm_release; + return 0; +} +module_init(tsm_init) + +static void __exit tsm_exit(void) +{ + class_destroy(tsm_class); +} +module_exit(tsm_exit) + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TEE Security Manager Class Device"); diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 431054810dca..cd97c63ffa32 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -5,6 +5,7 @@ #include #include #include +#include #define TSM_REPORT_INBLOB_MAX 64 #define TSM_REPORT_OUTBLOB_MAX SZ_32K @@ -107,6 +108,16 @@ struct tsm_report_ops { bool (*report_bin_attr_visible)(int n); }; +struct tsm_dev { + struct device dev; + int id; +}; + +DEFINE_FREE(put_tsm_dev, struct tsm_dev *, + if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) + int tsm_report_register(const struct tsm_report_ops *ops, void *priv); int tsm_report_unregister(const struct tsm_report_ops *ops); +struct tsm_dev *tsm_register(struct device *parent); +void tsm_unregister(struct tsm_dev *tsm_dev); #endif /* __TSM_H */ -- cgit v1.2.3 From f16469ee733ac52b2373216803699cbb05e82786 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:54 -0700 Subject: PCI/IDE: Enumerate Selective Stream IDE capabilities Link encryption is a new PCIe feature enumerated by "PCIe r7.0 section 7.9.26 IDE Extended Capability". It is both a standalone port + endpoint capability, and a building block for the security protocol defined by "PCIe r7.0 section 11 TEE Device Interface Security Protocol (TDISP)". That protocol coordinates device security setup between a platform TSM (TEE Security Manager) and a device DSM (Device Security Manager). While the platform TSM can allocate resources like Stream ID and manage keys, it still requires system software to manage the IDE capability register block. Add register definitions and basic enumeration in preparation for Selective IDE Stream establishment. A follow on change selects the new CONFIG_PCI_IDE symbol. Note that while the IDE specification defines both a point-to-point "Link Stream" and a Root Port to endpoint "Selective Stream", only "Selective Stream" is considered for Linux as that is the predominant mode expected by Trusted Execution Environment Security Managers (TSMs), and it is the security model that limits the number of PCI components within the TCB in a PCIe topology with switches. Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Reviewed-by: Aneesh Kumar K.V Link: https://patch.msgid.link/20251031212902.2256310-3-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/Kconfig | 3 ++ drivers/pci/Makefile | 1 + drivers/pci/ide.c | 88 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 6 +++ drivers/pci/probe.c | 1 + include/linux/pci.h | 7 ++++ include/uapi/linux/pci_regs.h | 81 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 187 insertions(+) create mode 100644 drivers/pci/ide.c (limited to 'include/linux') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index f94f5d384362..b28423e2057f 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -122,6 +122,9 @@ config XEN_PCIDEV_FRONTEND config PCI_ATS bool +config PCI_IDE + bool + config PCI_DOE bool "Enable PCI Data Object Exchange (DOE) support" help diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 67647f1880fb..6612256fd37d 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_PCI_P2PDMA) += p2pdma.o obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o obj-$(CONFIG_VGA_ARB) += vgaarb.o obj-$(CONFIG_PCI_DOE) += doe.o +obj-$(CONFIG_PCI_IDE) += ide.o obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c new file mode 100644 index 000000000000..26866edf91b4 --- /dev/null +++ b/drivers/pci/ide.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */ + +/* PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE) */ + +#define dev_fmt(fmt) "PCI/IDE: " fmt +#include +#include +#include + +#include "pci.h" + +static int __sel_ide_offset(u16 ide_cap, u8 nr_link_ide, u8 stream_index, + u8 nr_ide_mem) +{ + u32 offset = ide_cap + PCI_IDE_LINK_STREAM_0 + + nr_link_ide * PCI_IDE_LINK_BLOCK_SIZE; + + /* + * Assume a constant number of address association resources per stream + * index + */ + return offset + stream_index * PCI_IDE_SEL_BLOCK_SIZE(nr_ide_mem); +} + +void pci_ide_init(struct pci_dev *pdev) +{ + u16 nr_link_ide, nr_ide_mem, nr_streams; + u16 ide_cap; + u32 val; + + if (!pci_is_pcie(pdev)) + return; + + ide_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_IDE); + if (!ide_cap) + return; + + pci_read_config_dword(pdev, ide_cap + PCI_IDE_CAP, &val); + if ((val & PCI_IDE_CAP_SELECTIVE) == 0) + return; + + /* + * Require endpoint IDE capability to be paired with IDE Root Port IDE + * capability. + */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) { + struct pci_dev *rp = pcie_find_root_port(pdev); + + if (!rp->ide_cap) + return; + } + + pdev->ide_cfg = FIELD_GET(PCI_IDE_CAP_SEL_CFG, val); + pdev->ide_tee_limit = FIELD_GET(PCI_IDE_CAP_TEE_LIMITED, val); + + if (val & PCI_IDE_CAP_LINK) + nr_link_ide = 1 + FIELD_GET(PCI_IDE_CAP_LINK_TC_NUM, val); + else + nr_link_ide = 0; + + nr_ide_mem = 0; + nr_streams = 1 + FIELD_GET(PCI_IDE_CAP_SEL_NUM, val); + for (u16 i = 0; i < nr_streams; i++) { + int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem); + int nr_assoc; + u32 val; + + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val); + + /* + * Let's not entertain streams that do not have a constant + * number of address association blocks + */ + nr_assoc = FIELD_GET(PCI_IDE_SEL_CAP_ASSOC_NUM, val); + if (i && (nr_assoc != nr_ide_mem)) { + pci_info(pdev, "Unsupported Selective Stream %d capability, SKIP the rest\n", i); + nr_streams = i; + break; + } + + nr_ide_mem = nr_assoc; + } + + pdev->ide_cap = ide_cap; + pdev->nr_link_ide = nr_link_ide; + pdev->nr_ide_mem = nr_ide_mem; +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4492b809094b..86ef13e7cece 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -613,6 +613,12 @@ static inline void pci_doe_sysfs_init(struct pci_dev *pdev) { } static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #endif +#ifdef CONFIG_PCI_IDE +void pci_ide_init(struct pci_dev *dev); +#else +static inline void pci_ide_init(struct pci_dev *dev) { } +#endif + /** * pci_dev_set_io_state - Set the new error state if possible. * diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0ce98e18b5a8..4c55020f3ddf 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2667,6 +2667,7 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_doe_init(dev); /* Data Object Exchange */ pci_tph_init(dev); /* TLP Processing Hints */ pci_rebar_init(dev); /* Resizable BAR */ + pci_ide_init(dev); /* Link Integrity and Data Encryption */ pcie_report_downtraining(dev); pci_init_reset_methods(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..4402ca931124 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -539,6 +539,13 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_NPEM struct npem *npem; /* Native PCIe Enclosure Management */ +#endif +#ifdef CONFIG_PCI_IDE + u16 ide_cap; /* Link Integrity & Data Encryption */ + u8 nr_ide_mem; /* Address association resources for streams */ + u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ + unsigned int ide_cfg:1; /* Config cycles over IDE */ + unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ #endif u16 acs_cap; /* ACS Capability offset */ u8 supported_speeds; /* Supported Link Speeds Vector */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 07e06aafec50..05bd22d9e352 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -754,6 +754,7 @@ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ #define PCI_EXT_CAP_ID_PL_32GT 0x2A /* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_IDE 0x30 /* Integrity and Data Encryption */ #define PCI_EXT_CAP_ID_PL_64GT 0x31 /* Physical Layer 64.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_64GT @@ -1249,4 +1250,84 @@ #define PCI_DVSEC_CXL_PORT_CTL 0x0c #define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 +/* Integrity and Data Encryption Extended Capability */ +#define PCI_IDE_CAP 0x04 +#define PCI_IDE_CAP_LINK 0x1 /* Link IDE Stream Supported */ +#define PCI_IDE_CAP_SELECTIVE 0x2 /* Selective IDE Streams Supported */ +#define PCI_IDE_CAP_FLOWTHROUGH 0x4 /* Flow-Through IDE Stream Supported */ +#define PCI_IDE_CAP_PARTIAL_HEADER_ENC 0x8 /* Partial Header Encryption Supported */ +#define PCI_IDE_CAP_AGGREGATION 0x10 /* Aggregation Supported */ +#define PCI_IDE_CAP_PCRC 0x20 /* PCRC Supported */ +#define PCI_IDE_CAP_IDE_KM 0x40 /* IDE_KM Protocol Supported */ +#define PCI_IDE_CAP_SEL_CFG 0x80 /* Selective IDE for Config Request Support */ +#define PCI_IDE_CAP_ALG __GENMASK(12, 8) /* Supported Algorithms */ +#define PCI_IDE_CAP_ALG_AES_GCM_256 0 /* AES-GCM 256 key size, 96b MAC */ +#define PCI_IDE_CAP_LINK_TC_NUM __GENMASK(15, 13) /* Link IDE TCs */ +#define PCI_IDE_CAP_SEL_NUM __GENMASK(23, 16) /* Supported Selective IDE Streams */ +#define PCI_IDE_CAP_TEE_LIMITED 0x1000000 /* TEE-Limited Stream Supported */ +#define PCI_IDE_CTL 0x08 +#define PCI_IDE_CTL_FLOWTHROUGH_IDE 0x4 /* Flow-Through IDE Stream Enabled */ + +#define PCI_IDE_LINK_STREAM_0 0xc /* First Link Stream Register Block */ +#define PCI_IDE_LINK_BLOCK_SIZE 8 +/* Link IDE Stream block, up to PCI_IDE_CAP_LINK_TC_NUM */ +#define PCI_IDE_LINK_CTL_0 0x00 /* First Link Control Register Offset in block */ +#define PCI_IDE_LINK_CTL_EN 0x1 /* Link IDE Stream Enable */ +#define PCI_IDE_LINK_CTL_TX_AGGR_NPR __GENMASK(3, 2) /* Tx Aggregation Mode NPR */ +#define PCI_IDE_LINK_CTL_TX_AGGR_PR __GENMASK(5, 4) /* Tx Aggregation Mode PR */ +#define PCI_IDE_LINK_CTL_TX_AGGR_CPL __GENMASK(7, 6) /* Tx Aggregation Mode CPL */ +#define PCI_IDE_LINK_CTL_PCRC_EN 0x100 /* PCRC Enable */ +#define PCI_IDE_LINK_CTL_PART_ENC __GENMASK(13, 10) /* Partial Header Encryption Mode */ +#define PCI_IDE_LINK_CTL_ALG __GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */ +#define PCI_IDE_LINK_CTL_TC __GENMASK(21, 19) /* Traffic Class */ +#define PCI_IDE_LINK_CTL_ID __GENMASK(31, 24) /* Stream ID */ +#define PCI_IDE_LINK_STS_0 0x4 /* First Link Status Register Offset in block */ +#define PCI_IDE_LINK_STS_STATE __GENMASK(3, 0) /* Link IDE Stream State */ +#define PCI_IDE_LINK_STS_IDE_FAIL 0x80000000 /* IDE fail message received */ + +/* Selective IDE Stream block, up to PCI_IDE_CAP_SELECTIVE_STREAMS_NUM */ +/* Selective IDE Stream Capability Register */ +#define PCI_IDE_SEL_CAP 0x00 +#define PCI_IDE_SEL_CAP_ASSOC_NUM __GENMASK(3, 0) +/* Selective IDE Stream Control Register */ +#define PCI_IDE_SEL_CTL 0x04 +#define PCI_IDE_SEL_CTL_EN 0x1 /* Selective IDE Stream Enable */ +#define PCI_IDE_SEL_CTL_TX_AGGR_NPR __GENMASK(3, 2) /* Tx Aggregation Mode NPR */ +#define PCI_IDE_SEL_CTL_TX_AGGR_PR __GENMASK(5, 4) /* Tx Aggregation Mode PR */ +#define PCI_IDE_SEL_CTL_TX_AGGR_CPL __GENMASK(7, 6) /* Tx Aggregation Mode CPL */ +#define PCI_IDE_SEL_CTL_PCRC_EN 0x100 /* PCRC Enable */ +#define PCI_IDE_SEL_CTL_CFG_EN 0x200 /* Selective IDE for Configuration Requests */ +#define PCI_IDE_SEL_CTL_PART_ENC __GENMASK(13, 10) /* Partial Header Encryption Mode */ +#define PCI_IDE_SEL_CTL_ALG __GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */ +#define PCI_IDE_SEL_CTL_TC __GENMASK(21, 19) /* Traffic Class */ +#define PCI_IDE_SEL_CTL_DEFAULT 0x400000 /* Default Stream */ +#define PCI_IDE_SEL_CTL_TEE_LIMITED 0x800000 /* TEE-Limited Stream */ +#define PCI_IDE_SEL_CTL_ID __GENMASK(31, 24) /* Stream ID */ +#define PCI_IDE_SEL_CTL_ID_MAX 255 +/* Selective IDE Stream Status Register */ +#define PCI_IDE_SEL_STS 0x08 +#define PCI_IDE_SEL_STS_STATE __GENMASK(3, 0) /* Selective IDE Stream State */ +#define PCI_IDE_SEL_STS_STATE_INSECURE 0 +#define PCI_IDE_SEL_STS_STATE_SECURE 2 +#define PCI_IDE_SEL_STS_IDE_FAIL 0x80000000 /* IDE fail message received */ +/* IDE RID Association Register 1 */ +#define PCI_IDE_SEL_RID_1 0x0c +#define PCI_IDE_SEL_RID_1_LIMIT __GENMASK(23, 8) +/* IDE RID Association Register 2 */ +#define PCI_IDE_SEL_RID_2 0x10 +#define PCI_IDE_SEL_RID_2_VALID 0x1 +#define PCI_IDE_SEL_RID_2_BASE __GENMASK(23, 8) +#define PCI_IDE_SEL_RID_2_SEG __GENMASK(31, 24) +/* Selective IDE Address Association Register Block, up to PCI_IDE_SEL_CAP_ASSOC_NUM */ +#define PCI_IDE_SEL_ADDR_BLOCK_SIZE 12 +#define PCI_IDE_SEL_ADDR_1(x) (20 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +#define PCI_IDE_SEL_ADDR_1_VALID 0x1 +#define PCI_IDE_SEL_ADDR_1_BASE_LOW __GENMASK(19, 8) +#define PCI_IDE_SEL_ADDR_1_LIMIT_LOW __GENMASK(31, 20) +/* IDE Address Association Register 2 is "Memory Limit Upper" */ +#define PCI_IDE_SEL_ADDR_2(x) (24 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +/* IDE Address Association Register 3 is "Memory Base Upper" */ +#define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +#define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From 215afa89d249bb095126cf00f8be719e421c75e9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:55 -0700 Subject: PCI: Introduce pci_walk_bus_reverse(), for_each_pci_dev_reverse() PCI/TSM, the PCI core functionality for the PCIe TEE Device Interface Security Protocol (TDISP), has a need to walk all subordinate functions of a Device Security Manager (DSM) to setup a device security context. A DSM is physical function 0 of multi-function or SR-IOV device endpoint, or it is an upstream switch port. In error scenarios or when a TEE Security Manager (TSM) device is removed it needs to unwind all established DSM contexts. Introduce reverse versions of PCI device iteration helpers to mirror the setup path and ensure that dependent children are handled before parents. Cc: Greg Kroah-Hartman Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-4-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/base/bus.c | 38 ++++++++++++++++++++++++++++ drivers/pci/bus.c | 39 +++++++++++++++++++++++++++++ drivers/pci/search.c | 62 ++++++++++++++++++++++++++++++++++++++++------ include/linux/device/bus.h | 3 +++ include/linux/pci.h | 11 ++++++++ 5 files changed, 145 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 5e75e1bce551..d19dae8f9d1b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -334,6 +334,19 @@ static struct device *next_device(struct klist_iter *i) return dev; } +static struct device *prev_device(struct klist_iter *i) +{ + struct klist_node *n = klist_prev(i); + struct device *dev = NULL; + struct device_private *dev_prv; + + if (n) { + dev_prv = to_device_private_bus(n); + dev = dev_prv->device; + } + return dev; +} + /** * bus_for_each_dev - device iterator. * @bus: bus type. @@ -414,6 +427,31 @@ struct device *bus_find_device(const struct bus_type *bus, } EXPORT_SYMBOL_GPL(bus_find_device); +struct device *bus_find_device_reverse(const struct bus_type *bus, + struct device *start, const void *data, + device_match_t match) +{ + struct subsys_private *sp = bus_to_subsys(bus); + struct klist_iter i; + struct device *dev; + + if (!sp) + return NULL; + + klist_iter_init_node(&sp->klist_devices, &i, + (start ? &start->p->knode_bus : NULL)); + while ((dev = prev_device(&i))) { + if (match(dev, data)) { + get_device(dev); + break; + } + } + klist_iter_exit(&i); + subsys_put(sp); + return dev; +} +EXPORT_SYMBOL_GPL(bus_find_device_reverse); + static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index f26aec6ff588..b8b17f825bc0 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -432,6 +433,27 @@ static int __pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void return ret; } +static int __pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), + void *userdata) +{ + struct pci_dev *dev; + int ret = 0; + + list_for_each_entry_reverse(dev, &top->devices, bus_list) { + if (dev->subordinate) { + ret = __pci_walk_bus_reverse(dev->subordinate, cb, + userdata); + if (ret) + break; + } + ret = cb(dev, userdata); + if (ret) + break; + } + return ret; +} + /** * pci_walk_bus - walk devices on/under bus, calling callback. * @top: bus whose devices should be walked @@ -453,6 +475,23 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void } EXPORT_SYMBOL_GPL(pci_walk_bus); +/** + * pci_walk_bus_reverse - walk devices on/under bus, calling callback. + * @top: bus whose devices should be walked + * @cb: callback to be called for each device found + * @userdata: arbitrary pointer to be passed to callback + * + * Same semantics as pci_walk_bus(), but walks the bus in reverse order. + */ +void pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), void *userdata) +{ + down_read(&pci_bus_sem); + __pci_walk_bus_reverse(top, cb, userdata); + up_read(&pci_bus_sem); +} +EXPORT_SYMBOL_GPL(pci_walk_bus_reverse); + void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata) { lockdep_assert_held(&pci_bus_sem); diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 53840634fbfc..e6e84dc62e82 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -282,6 +282,45 @@ static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id, return pdev; } +static struct pci_dev *pci_get_dev_by_id_reverse(const struct pci_device_id *id, + struct pci_dev *from) +{ + struct device *dev; + struct device *dev_start = NULL; + struct pci_dev *pdev = NULL; + + if (from) + dev_start = &from->dev; + dev = bus_find_device_reverse(&pci_bus_type, dev_start, (void *)id, + match_pci_dev_by_id); + if (dev) + pdev = to_pci_dev(dev); + pci_dev_put(from); + return pdev; +} + +enum pci_search_direction { + PCI_SEARCH_FORWARD, + PCI_SEARCH_REVERSE, +}; + +static struct pci_dev *__pci_get_subsys(unsigned int vendor, unsigned int device, + unsigned int ss_vendor, unsigned int ss_device, + struct pci_dev *from, enum pci_search_direction dir) +{ + struct pci_device_id id = { + .vendor = vendor, + .device = device, + .subvendor = ss_vendor, + .subdevice = ss_device, + }; + + if (dir == PCI_SEARCH_FORWARD) + return pci_get_dev_by_id(&id, from); + else + return pci_get_dev_by_id_reverse(&id, from); +} + /** * pci_get_subsys - begin or continue searching for a PCI device by vendor/subvendor/device/subdevice id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids @@ -302,14 +341,8 @@ struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from) { - struct pci_device_id id = { - .vendor = vendor, - .device = device, - .subvendor = ss_vendor, - .subdevice = ss_device, - }; - - return pci_get_dev_by_id(&id, from); + return __pci_get_subsys(vendor, device, ss_vendor, ss_device, from, + PCI_SEARCH_FORWARD); } EXPORT_SYMBOL(pci_get_subsys); @@ -334,6 +367,19 @@ struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, } EXPORT_SYMBOL(pci_get_device); +/* + * Same semantics as pci_get_device(), except walks the PCI device list + * in reverse discovery order. + */ +struct pci_dev *pci_get_device_reverse(unsigned int vendor, + unsigned int device, + struct pci_dev *from) +{ + return __pci_get_subsys(vendor, device, PCI_ANY_ID, PCI_ANY_ID, from, + PCI_SEARCH_REVERSE); +} +EXPORT_SYMBOL(pci_get_device_reverse); + /** * pci_get_class - begin or continue searching for a PCI device by class * @class: search for a PCI device with this class designation diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index f5a56efd2bd6..99b1002b3e31 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -150,6 +150,9 @@ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, device_iter_t fn); struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match); +struct device *bus_find_device_reverse(const struct bus_type *bus, + struct device *start, const void *data, + device_match_t match); /** * bus_find_device_by_name - device iterator for locating a particular device * of a specific name. diff --git a/include/linux/pci.h b/include/linux/pci.h index 4402ca931124..b6a12a82be12 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -582,6 +582,8 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus); #define to_pci_dev(n) container_of(n, struct pci_dev, dev) #define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) +#define for_each_pci_dev_reverse(d) \ + while ((d = pci_get_device_reverse(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) static inline int pci_channel_offline(struct pci_dev *pdev) { @@ -1242,6 +1244,8 @@ u64 pci_get_dsn(struct pci_dev *dev); struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from); +struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device, + struct pci_dev *from); struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from); @@ -1661,6 +1665,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); +void pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), void *userdata); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); resource_size_t pcibios_window_alignment(struct pci_bus *bus, @@ -2049,6 +2055,11 @@ static inline struct pci_dev *pci_get_device(unsigned int vendor, struct pci_dev *from) { return NULL; } +static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor, + unsigned int device, + struct pci_dev *from) +{ return NULL; } + static inline struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, -- cgit v1.2.3 From 3225f52cde56f46789a4972d3c54df8a4d75f022 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:56 -0700 Subject: PCI/TSM: Establish Secure Sessions and Link Encryption The PCIe 7.0 specification, section 11, defines the Trusted Execution Environment (TEE) Device Interface Security Protocol (TDISP). This protocol definition builds upon Component Measurement and Authentication (CMA), and link Integrity and Data Encryption (IDE). It adds support for assigning devices (PCI physical or virtual function) to a confidential VM such that the assigned device is enabled to access guest private memory protected by technologies like Intel TDX, AMD SEV-SNP, RISCV COVE, or ARM CCA. The "TSM" (TEE Security Manager) is a concept in the TDISP specification of an agent that mediates between a "DSM" (Device Security Manager) and system software in both a VMM and a confidential VM. A VMM uses TSM ABIs to setup link security and assign devices. A confidential VM uses TSM ABIs to transition an assigned device into the TDISP "RUN" state and validate its configuration. From a Linux perspective the TSM abstracts many of the details of TDISP, IDE, and CMA. Some of those details leak through at times, but for the most part TDISP is an internal implementation detail of the TSM. CONFIG_PCI_TSM adds an "authenticated" attribute and "tsm/" subdirectory to pci-sysfs. Consider that the TSM driver may itself be a PCI driver. Userspace can watch for the arrival of a "TSM" device, /sys/class/tsm/tsm0/uevent KOBJ_CHANGE, to know when the PCI core has initialized TSM services. The operations that can be executed against a PCI device are split into two mutually exclusive operation sets, "Link" and "Security" (struct pci_tsm_{link,security}_ops). The "Link" operations manage physical link security properties and communication with the device's Device Security Manager firmware. These are the host side operations in TDISP. The "Security" operations coordinate the security state of the assigned virtual device (TDI). These are the guest side operations in TDISP. Only "link" (Secure Session and physical Link Encryption) operations are defined at this stage. There are placeholders for the device security (Trusted Computing Base entry / exit) operations. The locking allows for multiple devices to be executing commands simultaneously, one outstanding command per-device and an rwsem synchronizes the implementation relative to TSM registration/unregistration events. Thanks to Wu Hao for his work on an early draft of this support. Cc: Lukas Wunner Cc: Samuel Ortiz Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Link: https://patch.msgid.link/20251031212902.2256310-5-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-pci | 51 +++ Documentation/driver-api/pci/index.rst | 1 + Documentation/driver-api/pci/tsm.rst | 21 ++ MAINTAINERS | 4 +- drivers/pci/Kconfig | 15 + drivers/pci/Makefile | 1 + drivers/pci/doe.c | 2 - drivers/pci/pci-sysfs.c | 4 + drivers/pci/pci.h | 10 + drivers/pci/probe.c | 3 + drivers/pci/remove.c | 6 + drivers/pci/tsm.c | 643 ++++++++++++++++++++++++++++++++ drivers/virt/coco/tsm-core.c | 46 ++- include/linux/pci-doe.h | 4 + include/linux/pci-tsm.h | 157 ++++++++ include/linux/pci.h | 3 + include/linux/tsm.h | 5 +- include/uapi/linux/pci_regs.h | 1 + 18 files changed, 971 insertions(+), 6 deletions(-) create mode 100644 Documentation/driver-api/pci/tsm.rst create mode 100644 drivers/pci/tsm.c create mode 100644 include/linux/pci-tsm.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 92debe879ffb..6ffe02f854d6 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -621,3 +621,54 @@ Description: number extended capability. The file is read only and due to the possible sensitivity of accessible serial numbers, admin only. + +What: /sys/bus/pci/devices/.../tsm/ +Contact: linux-coco@lists.linux.dev +Description: + This directory only appears if a physical device function + supports authentication (PCIe CMA-SPDM), interface security + (PCIe TDISP), and is accepted for secure operation by the + platform TSM driver. This attribute directory appears + dynamically after the platform TSM driver loads. So, only after + the /sys/class/tsm/tsm0 device arrives can tools assume that + devices without a tsm/ attribute directory will never have one; + before that, the security capabilities of the device relative to + the platform TSM are unknown. See + Documentation/ABI/testing/sysfs-class-tsm. + +What: /sys/bus/pci/devices/.../tsm/connect +Contact: linux-coco@lists.linux.dev +Description: + (RW) Write the name of a TSM (TEE Security Manager) device from + /sys/class/tsm to this file to establish a connection with the + device. This typically includes an SPDM (DMTF Security + Protocols and Data Models) session over PCIe DOE (Data Object + Exchange) and may also include PCIe IDE (Integrity and Data + Encryption) establishment. Reads from this attribute return the + name of the connected TSM or the empty string if not + connected. A TSM device signals its readiness to accept PCI + connection via a KOBJ_CHANGE event. + +What: /sys/bus/pci/devices/.../tsm/disconnect +Contact: linux-coco@lists.linux.dev +Description: + (WO) Write the name of the TSM device that was specified + to 'connect' to teardown the connection. + +What: /sys/bus/pci/devices/.../authenticated +Contact: linux-pci@vger.kernel.org +Description: + When the device's tsm/ directory is present device + authentication (PCIe CMA-SPDM) and link encryption (PCIe IDE) + are handled by the platform TSM (TEE Security Manager). When the + tsm/ directory is not present this attribute reflects only the + native CMA-SPDM authentication state with the kernel's + certificate store. + + If the attribute is not present, it indicates that + authentication is unsupported by the device, or the TSM has no + available authentication methods for the device. + + When present and the tsm/ attribute directory is present, the + authenticated attribute is an alias for the device 'connect' + state. See the 'tsm/connect' attribute for more details. diff --git a/Documentation/driver-api/pci/index.rst b/Documentation/driver-api/pci/index.rst index a38e475cdbe3..9e1b801d0f74 100644 --- a/Documentation/driver-api/pci/index.rst +++ b/Documentation/driver-api/pci/index.rst @@ -10,6 +10,7 @@ The Linux PCI driver implementer's API guide pci p2pdma + tsm .. only:: subproject and html diff --git a/Documentation/driver-api/pci/tsm.rst b/Documentation/driver-api/pci/tsm.rst new file mode 100644 index 000000000000..232b92bec93f --- /dev/null +++ b/Documentation/driver-api/pci/tsm.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +======================================================== +PCI Trusted Execution Environment Security Manager (TSM) +======================================================== + +Subsystem Interfaces +==================== + +.. kernel-doc:: include/linux/pci-ide.h + :internal: + +.. kernel-doc:: drivers/pci/ide.c + :export: + +.. kernel-doc:: include/linux/pci-tsm.h + :internal: + +.. kernel-doc:: drivers/pci/tsm.c + :export: diff --git a/MAINTAINERS b/MAINTAINERS index b8c9929532ed..f1c8793bf03e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26118,8 +26118,10 @@ L: linux-coco@lists.linux.dev S: Maintained F: Documentation/ABI/testing/configfs-tsm-report F: Documentation/driver-api/coco/ +F: Documentation/driver-api/pci/tsm.rst +F: drivers/pci/tsm.c F: drivers/virt/coco/guest/ -F: include/linux/tsm*.h +F: include/linux/*tsm*.h F: samples/tsm-mr/ TRUSTED SERVICES TEE DRIVER diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index b28423e2057f..00b0210e1f1d 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -125,6 +125,21 @@ config PCI_ATS config PCI_IDE bool +config PCI_TSM + bool "PCI TSM: Device security protocol support" + select PCI_IDE + select PCI_DOE + select TSM + help + The TEE (Trusted Execution Environment) Device Interface + Security Protocol (TDISP) defines a "TSM" as a platform agent + that manages device authentication, link encryption, link + integrity protection, and assignment of PCI device functions + (virtual or physical) to confidential computing VMs that can + access (DMA) guest private memory. + + Enable a platform TSM driver to use this capability. + config PCI_DOE bool "Enable PCI Data Object Exchange (DOE) support" help diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 6612256fd37d..2c545f877062 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o obj-$(CONFIG_VGA_ARB) += vgaarb.o obj-$(CONFIG_PCI_DOE) += doe.o obj-$(CONFIG_PCI_IDE) += ide.o +obj-$(CONFIG_PCI_TSM) += tsm.o obj-$(CONFIG_PCI_DYNAMIC_OF_NODES) += of_property.o obj-$(CONFIG_PCI_NPEM) += npem.o obj-$(CONFIG_PCIE_TPH) += tph.o diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c index aae9a8a00406..62be9c8dbc52 100644 --- a/drivers/pci/doe.c +++ b/drivers/pci/doe.c @@ -24,8 +24,6 @@ #include "pci.h" -#define PCI_DOE_FEATURE_DISCOVERY 0 - /* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */ #define PCI_DOE_TIMEOUT HZ #define PCI_DOE_POLL_INTERVAL (PCI_DOE_TIMEOUT / 128) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9d6f74bd95f8..7f9237a926c2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1868,6 +1868,10 @@ const struct attribute_group *pci_dev_attr_groups[] = { #endif #ifdef CONFIG_PCI_DOE &pci_doe_sysfs_group, +#endif +#ifdef CONFIG_PCI_TSM + &pci_tsm_auth_attr_group, + &pci_tsm_attr_group, #endif NULL, }; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 86ef13e7cece..6e4cc1c9aa58 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -619,6 +619,16 @@ void pci_ide_init(struct pci_dev *dev); static inline void pci_ide_init(struct pci_dev *dev) { } #endif +#ifdef CONFIG_PCI_TSM +void pci_tsm_init(struct pci_dev *pdev); +void pci_tsm_destroy(struct pci_dev *pdev); +extern const struct attribute_group pci_tsm_attr_group; +extern const struct attribute_group pci_tsm_auth_attr_group; +#else +static inline void pci_tsm_init(struct pci_dev *pdev) { } +static inline void pci_tsm_destroy(struct pci_dev *pdev) { } +#endif + /** * pci_dev_set_io_state - Set the new error state if possible. * diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4c55020f3ddf..d1467348c169 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2763,6 +2763,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) ret = device_add(&dev->dev); WARN_ON(ret < 0); + /* Establish pdev->tsm for newly added (e.g. new SR-IOV VFs) */ + pci_tsm_init(dev); + pci_npem_create(dev); pci_doe_sysfs_init(dev); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index ce5c25adef55..803391892c4a 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -57,6 +57,12 @@ static void pci_destroy_dev(struct pci_dev *dev) pci_doe_sysfs_teardown(dev); pci_npem_remove(dev); + /* + * While device is in D0 drop the device from TSM link operations + * including unbind and disconnect (IDE + SPDM teardown). + */ + pci_tsm_destroy(dev); + device_del(&dev->dev); down_write(&pci_bus_sem); diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c new file mode 100644 index 000000000000..6a2849f77adc --- /dev/null +++ b/drivers/pci/tsm.c @@ -0,0 +1,643 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Interface with platform TEE Security Manager (TSM) objects as defined by + * PCIe r7.0 section 11 TEE Device Interface Security Protocol (TDISP) + * + * Copyright(c) 2024-2025 Intel Corporation. All rights reserved. + */ + +#define dev_fmt(fmt) "PCI/TSM: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "pci.h" + +/* + * Provide a read/write lock against the init / exit of pdev tsm + * capabilities and arrival/departure of a TSM instance + */ +static DECLARE_RWSEM(pci_tsm_rwsem); + +/* + * Count of TSMs registered that support physical link operations vs device + * security state management. + */ +static int pci_tsm_link_count; +static int pci_tsm_devsec_count; + +static const struct pci_tsm_ops *to_pci_tsm_ops(struct pci_tsm *tsm) +{ + return tsm->tsm_dev->pci_ops; +} + +static inline bool is_dsm(struct pci_dev *pdev) +{ + return pdev->tsm && pdev->tsm->dsm_dev == pdev; +} + +static inline bool has_tee(struct pci_dev *pdev) +{ + return pdev->devcap & PCI_EXP_DEVCAP_TEE; +} + +/* 'struct pci_tsm_pf0' wraps 'struct pci_tsm' when ->dsm_dev == ->pdev (self) */ +static struct pci_tsm_pf0 *to_pci_tsm_pf0(struct pci_tsm *tsm) +{ + /* + * All "link" TSM contexts reference the device that hosts the DSM + * interface for a set of devices. Walk to the DSM device and cast its + * ->tsm context to a 'struct pci_tsm_pf0 *'. + */ + struct pci_dev *pf0 = tsm->dsm_dev; + + if (!is_pci_tsm_pf0(pf0) || !is_dsm(pf0)) { + pci_WARN_ONCE(tsm->pdev, 1, "invalid context object\n"); + return NULL; + } + + return container_of(pf0->tsm, struct pci_tsm_pf0, base_tsm); +} + +static void tsm_remove(struct pci_tsm *tsm) +{ + struct pci_dev *pdev; + + if (!tsm) + return; + + pdev = tsm->pdev; + to_pci_tsm_ops(tsm)->remove(tsm); + pdev->tsm = NULL; +} +DEFINE_FREE(tsm_remove, struct pci_tsm *, if (_T) tsm_remove(_T)) + +static void pci_tsm_walk_fns(struct pci_dev *pdev, + int (*cb)(struct pci_dev *pdev, void *data), + void *data) +{ + /* Walk subordinate physical functions */ + for (int i = 0; i < 8; i++) { + struct pci_dev *pf __free(pci_dev_put) = pci_get_slot( + pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), i)); + + if (!pf) + continue; + + /* on entry function 0 has already run @cb */ + if (i > 0) + cb(pf, data); + + /* walk virtual functions of each pf */ + for (int j = 0; j < pci_num_vf(pf); j++) { + struct pci_dev *vf __free(pci_dev_put) = + pci_get_domain_bus_and_slot( + pci_domain_nr(pf->bus), + pci_iov_virtfn_bus(pf, j), + pci_iov_virtfn_devfn(pf, j)); + + if (!vf) + continue; + + cb(vf, data); + } + } + + /* + * Walk downstream devices, assumes that an upstream DSM is + * limited to downstream physical functions + */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM && is_dsm(pdev)) + pci_walk_bus(pdev->subordinate, cb, data); +} + +static void pci_tsm_walk_fns_reverse(struct pci_dev *pdev, + int (*cb)(struct pci_dev *pdev, + void *data), + void *data) +{ + /* Reverse walk downstream devices */ + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM && is_dsm(pdev)) + pci_walk_bus_reverse(pdev->subordinate, cb, data); + + /* Reverse walk subordinate physical functions */ + for (int i = 7; i >= 0; i--) { + struct pci_dev *pf __free(pci_dev_put) = pci_get_slot( + pdev->bus, PCI_DEVFN(PCI_SLOT(pdev->devfn), i)); + + if (!pf) + continue; + + /* reverse walk virtual functions */ + for (int j = pci_num_vf(pf) - 1; j >= 0; j--) { + struct pci_dev *vf __free(pci_dev_put) = + pci_get_domain_bus_and_slot( + pci_domain_nr(pf->bus), + pci_iov_virtfn_bus(pf, j), + pci_iov_virtfn_devfn(pf, j)); + + if (!vf) + continue; + cb(vf, data); + } + + /* on exit, caller will run @cb on function 0 */ + if (i > 0) + cb(pf, data); + } +} + +static int probe_fn(struct pci_dev *pdev, void *dsm) +{ + struct pci_dev *dsm_dev = dsm; + const struct pci_tsm_ops *ops = to_pci_tsm_ops(dsm_dev->tsm); + + pdev->tsm = ops->probe(dsm_dev->tsm->tsm_dev, pdev); + pci_dbg(pdev, "setup TSM context: DSM: %s status: %s\n", + pci_name(dsm_dev), pdev->tsm ? "success" : "failed"); + return 0; +} + +static int pci_tsm_connect(struct pci_dev *pdev, struct tsm_dev *tsm_dev) +{ + int rc; + struct pci_tsm_pf0 *tsm_pf0; + const struct pci_tsm_ops *ops = tsm_dev->pci_ops; + struct pci_tsm *pci_tsm __free(tsm_remove) = ops->probe(tsm_dev, pdev); + + /* connect() mutually exclusive with subfunction pci_tsm_init() */ + lockdep_assert_held_write(&pci_tsm_rwsem); + + if (!pci_tsm) + return -ENXIO; + + pdev->tsm = pci_tsm; + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + + /* mutex_intr assumes connect() is always sysfs/user driven */ + ACQUIRE(mutex_intr, lock)(&tsm_pf0->lock); + if ((rc = ACQUIRE_ERR(mutex_intr, &lock))) + return rc; + + rc = ops->connect(pdev); + if (rc) + return rc; + + pdev->tsm = no_free_ptr(pci_tsm); + + /* + * Now that the DSM is established, probe() all the potential + * dependent functions. Failure to probe a function is not fatal + * to connect(), it just disables subsequent security operations + * for that function. + * + * Note this is done unconditionally, without regard to finding + * PCI_EXP_DEVCAP_TEE on the dependent function, for robustness. The DSM + * is the ultimate arbiter of security state relative to a given + * interface id, and if it says it can manage TDISP state of a function, + * let it. + */ + if (has_tee(pdev)) + pci_tsm_walk_fns(pdev, probe_fn, pdev); + return 0; +} + +static ssize_t connect_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct tsm_dev *tsm_dev; + int rc; + + ACQUIRE(rwsem_read_intr, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &lock))) + return rc; + + if (!pdev->tsm) + return sysfs_emit(buf, "\n"); + + tsm_dev = pdev->tsm->tsm_dev; + return sysfs_emit(buf, "%s\n", dev_name(&tsm_dev->dev)); +} + +/* Is @tsm_dev managing physical link / session properties... */ +static bool is_link_tsm(struct tsm_dev *tsm_dev) +{ + return tsm_dev && tsm_dev->pci_ops && tsm_dev->pci_ops->link_ops.probe; +} + +/* ...or is @tsm_dev managing device security state ? */ +static bool is_devsec_tsm(struct tsm_dev *tsm_dev) +{ + return tsm_dev && tsm_dev->pci_ops && tsm_dev->pci_ops->devsec_ops.lock; +} + +static ssize_t connect_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int rc, id; + + rc = sscanf(buf, "tsm%d\n", &id); + if (rc != 1) + return -EINVAL; + + ACQUIRE(rwsem_write_kill, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &lock))) + return rc; + + if (pdev->tsm) + return -EBUSY; + + struct tsm_dev *tsm_dev __free(put_tsm_dev) = find_tsm_dev(id); + if (!is_link_tsm(tsm_dev)) + return -ENXIO; + + rc = pci_tsm_connect(pdev, tsm_dev); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(connect); + +static int remove_fn(struct pci_dev *pdev, void *data) +{ + tsm_remove(pdev->tsm); + return 0; +} + +static void __pci_tsm_disconnect(struct pci_dev *pdev) +{ + struct pci_tsm_pf0 *tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + const struct pci_tsm_ops *ops = to_pci_tsm_ops(pdev->tsm); + + /* disconnect() mutually exclusive with subfunction pci_tsm_init() */ + lockdep_assert_held_write(&pci_tsm_rwsem); + + /* + * disconnect() is uninterruptible as it may be called for device + * teardown + */ + guard(mutex)(&tsm_pf0->lock); + pci_tsm_walk_fns_reverse(pdev, remove_fn, NULL); + ops->disconnect(pdev); +} + +static void pci_tsm_disconnect(struct pci_dev *pdev) +{ + __pci_tsm_disconnect(pdev); + tsm_remove(pdev->tsm); +} + +static ssize_t disconnect_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t len) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct tsm_dev *tsm_dev; + int rc; + + ACQUIRE(rwsem_write_kill, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &lock))) + return rc; + + if (!pdev->tsm) + return -ENXIO; + + tsm_dev = pdev->tsm->tsm_dev; + if (!sysfs_streq(buf, dev_name(&tsm_dev->dev))) + return -EINVAL; + + pci_tsm_disconnect(pdev); + return len; +} +static DEVICE_ATTR_WO(disconnect); + +/* The 'authenticated' attribute is exclusive to the presence of a 'link' TSM */ +static bool pci_tsm_link_group_visible(struct kobject *kobj) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + return pci_tsm_link_count && is_pci_tsm_pf0(pdev); +} +DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(pci_tsm_link); + +/* + * 'link' and 'devsec' TSMs share the same 'tsm/' sysfs group, so the TSM type + * specific attributes need individual visibility checks. + */ +static umode_t pci_tsm_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + if (pci_tsm_link_group_visible(kobj)) { + if (attr == &dev_attr_connect.attr || + attr == &dev_attr_disconnect.attr) + return attr->mode; + } + + return 0; +} + +static bool pci_tsm_group_visible(struct kobject *kobj) +{ + return pci_tsm_link_group_visible(kobj); +} +DEFINE_SYSFS_GROUP_VISIBLE(pci_tsm); + +static struct attribute *pci_tsm_attrs[] = { + &dev_attr_connect.attr, + &dev_attr_disconnect.attr, + NULL +}; + +const struct attribute_group pci_tsm_attr_group = { + .name = "tsm", + .attrs = pci_tsm_attrs, + .is_visible = SYSFS_GROUP_VISIBLE(pci_tsm), +}; + +static ssize_t authenticated_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + /* + * When the SPDM session established via TSM the 'authenticated' state + * of the device is identical to the connect state. + */ + return connect_show(dev, attr, buf); +} +static DEVICE_ATTR_RO(authenticated); + +static struct attribute *pci_tsm_auth_attrs[] = { + &dev_attr_authenticated.attr, + NULL +}; + +const struct attribute_group pci_tsm_auth_attr_group = { + .attrs = pci_tsm_auth_attrs, + .is_visible = SYSFS_GROUP_VISIBLE(pci_tsm_link), +}; + +/* + * Retrieve physical function0 device whether it has TEE capability or not + */ +static struct pci_dev *pf0_dev_get(struct pci_dev *pdev) +{ + struct pci_dev *pf_dev = pci_physfn(pdev); + + if (PCI_FUNC(pf_dev->devfn) == 0) + return pci_dev_get(pf_dev); + + return pci_get_slot(pf_dev->bus, + pf_dev->devfn - PCI_FUNC(pf_dev->devfn)); +} + +/* + * Find the PCI Device instance that serves as the Device Security Manager (DSM) + * for @pdev. Note that no additional reference is held for the resulting device + * because that resulting object always has a registered lifetime + * greater-than-or-equal to that of the @pdev argument. This is by virtue of + * @pdev being a descendant of, or identical to, the returned DSM device. + */ +static struct pci_dev *find_dsm_dev(struct pci_dev *pdev) +{ + struct device *grandparent; + struct pci_dev *uport; + + if (is_pci_tsm_pf0(pdev)) + return pdev; + + struct pci_dev *pf0 __free(pci_dev_put) = pf0_dev_get(pdev); + if (!pf0) + return NULL; + + if (is_dsm(pf0)) + return pf0; + + /* + * For cases where a switch may be hosting TDISP services on behalf of + * downstream devices, check the first upstream port relative to this + * endpoint. + */ + if (!pdev->dev.parent) + return NULL; + grandparent = pdev->dev.parent->parent; + if (!grandparent) + return NULL; + if (!dev_is_pci(grandparent)) + return NULL; + uport = to_pci_dev(grandparent); + if (!pci_is_pcie(uport) || + pci_pcie_type(uport) != PCI_EXP_TYPE_UPSTREAM) + return NULL; + + if (is_dsm(uport)) + return uport; + return NULL; +} + +/** + * pci_tsm_link_constructor() - base 'struct pci_tsm' initialization for link TSMs + * @pdev: The PCI device + * @tsm: context to initialize + * @tsm_dev: Platform TEE Security Manager, initiator of security operations + */ +int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm, + struct tsm_dev *tsm_dev) +{ + if (!is_link_tsm(tsm_dev)) + return -EINVAL; + + tsm->dsm_dev = find_dsm_dev(pdev); + if (!tsm->dsm_dev) { + pci_warn(pdev, "failed to find Device Security Manager\n"); + return -ENXIO; + } + tsm->pdev = pdev; + tsm->tsm_dev = tsm_dev; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_tsm_link_constructor); + +/** + * pci_tsm_pf0_constructor() - common 'struct pci_tsm_pf0' (DSM) initialization + * @pdev: Physical Function 0 PCI device (as indicated by is_pci_tsm_pf0()) + * @tsm: context to initialize + * @tsm_dev: Platform TEE Security Manager, initiator of security operations + */ +int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, + struct tsm_dev *tsm_dev) +{ + mutex_init(&tsm->lock); + tsm->doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_PCI_SIG, + PCI_DOE_FEATURE_CMA); + if (!tsm->doe_mb) { + pci_warn(pdev, "TSM init failure, no CMA mailbox\n"); + return -ENODEV; + } + + return pci_tsm_link_constructor(pdev, &tsm->base_tsm, tsm_dev); +} +EXPORT_SYMBOL_GPL(pci_tsm_pf0_constructor); + +void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *pf0_tsm) +{ + mutex_destroy(&pf0_tsm->lock); +} +EXPORT_SYMBOL_GPL(pci_tsm_pf0_destructor); + +static void pf0_sysfs_enable(struct pci_dev *pdev) +{ + bool tee = has_tee(pdev); + + pci_dbg(pdev, "Device Security Manager detected (%s%s%s)\n", + pdev->ide_cap ? "IDE" : "", pdev->ide_cap && tee ? " " : "", + tee ? "TEE" : ""); + + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_auth_attr_group); + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_attr_group); +} + +int pci_tsm_register(struct tsm_dev *tsm_dev) +{ + struct pci_dev *pdev = NULL; + + if (!tsm_dev) + return -EINVAL; + + /* The TSM device must only implement one of link_ops or devsec_ops */ + if (!is_link_tsm(tsm_dev) && !is_devsec_tsm(tsm_dev)) + return -EINVAL; + + if (is_link_tsm(tsm_dev) && is_devsec_tsm(tsm_dev)) + return -EINVAL; + + guard(rwsem_write)(&pci_tsm_rwsem); + + /* On first enable, update sysfs groups */ + if (is_link_tsm(tsm_dev) && pci_tsm_link_count++ == 0) { + for_each_pci_dev(pdev) + if (is_pci_tsm_pf0(pdev)) + pf0_sysfs_enable(pdev); + } else if (is_devsec_tsm(tsm_dev)) { + pci_tsm_devsec_count++; + } + + return 0; +} + +static void pci_tsm_fn_exit(struct pci_dev *pdev) +{ + /* TODO: unbind the fn */ + tsm_remove(pdev->tsm); +} + +/** + * __pci_tsm_destroy() - destroy the TSM context for @pdev + * @pdev: device to cleanup + * @tsm_dev: the TSM device being removed, or NULL if @pdev is being removed. + * + * At device removal or TSM unregistration all established context + * with the TSM is torn down. Additionally, if there are no more TSMs + * registered, the PCI tsm/ sysfs attributes are hidden. + */ +static void __pci_tsm_destroy(struct pci_dev *pdev, struct tsm_dev *tsm_dev) +{ + struct pci_tsm *tsm = pdev->tsm; + + lockdep_assert_held_write(&pci_tsm_rwsem); + + /* + * First, handle the TSM removal case to shutdown @pdev sysfs, this is + * skipped if the device itself is being removed since sysfs goes away + * naturally at that point + */ + if (is_link_tsm(tsm_dev) && is_pci_tsm_pf0(pdev) && !pci_tsm_link_count) { + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_auth_attr_group); + sysfs_update_group(&pdev->dev.kobj, &pci_tsm_attr_group); + } + + /* Nothing else to do if this device never attached to the departing TSM */ + if (!tsm) + return; + + /* Now lookup the tsm_dev to destroy TSM context */ + if (!tsm_dev) + tsm_dev = tsm->tsm_dev; + else if (tsm_dev != tsm->tsm_dev) + return; + + if (is_link_tsm(tsm_dev) && is_pci_tsm_pf0(pdev)) + pci_tsm_disconnect(pdev); + else + pci_tsm_fn_exit(pdev); +} + +void pci_tsm_destroy(struct pci_dev *pdev) +{ + guard(rwsem_write)(&pci_tsm_rwsem); + __pci_tsm_destroy(pdev, NULL); +} + +void pci_tsm_init(struct pci_dev *pdev) +{ + guard(rwsem_read)(&pci_tsm_rwsem); + + /* + * Subfunctions are either probed synchronous with connect() or later + * when either the SR-IOV configuration is changed, or, unlikely, + * connect() raced initial bus scanning. + */ + if (pdev->tsm) + return; + + if (pci_tsm_link_count) { + struct pci_dev *dsm = find_dsm_dev(pdev); + + if (!dsm) + return; + + /* + * The only path to init a Device Security Manager capable + * device is via connect(). + */ + if (!dsm->tsm) + return; + + probe_fn(pdev, dsm); + } +} + +void pci_tsm_unregister(struct tsm_dev *tsm_dev) +{ + struct pci_dev *pdev = NULL; + + guard(rwsem_write)(&pci_tsm_rwsem); + if (is_link_tsm(tsm_dev)) + pci_tsm_link_count--; + if (is_devsec_tsm(tsm_dev)) + pci_tsm_devsec_count--; + for_each_pci_dev_reverse(pdev) + __pci_tsm_destroy(pdev, tsm_dev); +} + +int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, + size_t req_sz, void *resp, size_t resp_sz) +{ + struct pci_tsm_pf0 *tsm; + + if (!pdev->tsm || !is_pci_tsm_pf0(pdev)) + return -ENXIO; + + tsm = to_pci_tsm_pf0(pdev->tsm); + if (!tsm->doe_mb) + return -ENXIO; + + return pci_doe(tsm->doe_mb, PCI_VENDOR_ID_PCI_SIG, type, req, req_sz, + resp, resp_sz); +} +EXPORT_SYMBOL_GPL(pci_tsm_doe_transfer); diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c index 347507cc5e3f..0e705f3067a1 100644 --- a/drivers/virt/coco/tsm-core.c +++ b/drivers/virt/coco/tsm-core.c @@ -8,11 +8,29 @@ #include #include #include +#include static struct class *tsm_class; static DECLARE_RWSEM(tsm_rwsem); static DEFINE_IDA(tsm_ida); +static int match_id(struct device *dev, const void *data) +{ + struct tsm_dev *tsm_dev = container_of(dev, struct tsm_dev, dev); + int id = *(const int *)data; + + return tsm_dev->id == id; +} + +struct tsm_dev *find_tsm_dev(int id) +{ + struct device *dev = class_find_device(tsm_class, NULL, &id, match_id); + + if (!dev) + return NULL; + return container_of(dev, struct tsm_dev, dev); +} + static struct tsm_dev *alloc_tsm_dev(struct device *parent) { struct device *dev; @@ -36,7 +54,29 @@ static struct tsm_dev *alloc_tsm_dev(struct device *parent) return no_free_ptr(tsm_dev); } -struct tsm_dev *tsm_register(struct device *parent) +static struct tsm_dev *tsm_register_pci_or_reset(struct tsm_dev *tsm_dev, + struct pci_tsm_ops *pci_ops) +{ + int rc; + + if (!pci_ops) + return tsm_dev; + + tsm_dev->pci_ops = pci_ops; + rc = pci_tsm_register(tsm_dev); + if (rc) { + dev_err(tsm_dev->dev.parent, + "PCI/TSM registration failure: %d\n", rc); + device_unregister(&tsm_dev->dev); + return ERR_PTR(rc); + } + + /* Notify TSM userspace that PCI/TSM operations are now possible */ + kobject_uevent(&tsm_dev->dev.kobj, KOBJ_CHANGE); + return tsm_dev; +} + +struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *pci_ops) { struct tsm_dev *tsm_dev __free(put_tsm_dev) = alloc_tsm_dev(parent); struct device *dev; @@ -54,12 +94,14 @@ struct tsm_dev *tsm_register(struct device *parent) if (rc) return ERR_PTR(rc); - return no_free_ptr(tsm_dev); + return tsm_register_pci_or_reset(no_free_ptr(tsm_dev), pci_ops); } EXPORT_SYMBOL_GPL(tsm_register); void tsm_unregister(struct tsm_dev *tsm_dev) { + if (tsm_dev->pci_ops) + pci_tsm_unregister(tsm_dev); device_unregister(&tsm_dev->dev); } EXPORT_SYMBOL_GPL(tsm_unregister); diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h index 1f14aed4354b..bd4346a7c4e7 100644 --- a/include/linux/pci-doe.h +++ b/include/linux/pci-doe.h @@ -15,6 +15,10 @@ struct pci_doe_mb; +#define PCI_DOE_FEATURE_DISCOVERY 0 +#define PCI_DOE_FEATURE_CMA 1 +#define PCI_DOE_FEATURE_SSESSION 2 + struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor, u8 type); diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h new file mode 100644 index 000000000000..e921d30f9b6c --- /dev/null +++ b/include/linux/pci-tsm.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PCI_TSM_H +#define __PCI_TSM_H +#include +#include + +struct pci_tsm; +struct tsm_dev; + +/* + * struct pci_tsm_ops - manage confidential links and security state + * @link_ops: Coordinate PCIe SPDM and IDE establishment via a platform TSM. + * Provide a secure session transport for TDISP state management + * (typically bare metal physical function operations). + * @devsec_ops: Lock, unlock, and interrogate the security state of the + * function via the platform TSM (typically virtual function + * operations). + * + * This operations are mutually exclusive either a tsm_dev instance + * manages physical link properties or it manages function security + * states like TDISP lock/unlock. + */ +struct pci_tsm_ops { + /* + * struct pci_tsm_link_ops - Manage physical link and the TSM/DSM session + * @probe: establish context with the TSM (allocate / wrap 'struct + * pci_tsm') for follow-on link operations + * @remove: destroy link operations context + * @connect: establish / validate a secure connection (e.g. IDE) + * with the device + * @disconnect: teardown the secure link + * + * Context: @probe, @remove, @connect, and @disconnect run under + * pci_tsm_rwsem held for write to sync with TSM unregistration and + * mutual exclusion of @connect and @disconnect. @connect and + * @disconnect additionally run under the DSM lock (struct + * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. + */ + struct_group_tagged(pci_tsm_link_ops, link_ops, + struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, + struct pci_dev *pdev); + void (*remove)(struct pci_tsm *tsm); + int (*connect)(struct pci_dev *pdev); + void (*disconnect)(struct pci_dev *pdev); + ); + + /* + * struct pci_tsm_devsec_ops - Manage the security state of the function + * @lock: establish context with the TSM (allocate / wrap 'struct + * pci_tsm') for follow-on security state transitions from the + * LOCKED state + * @unlock: destroy TSM context and return device to UNLOCKED state + * + * Context: @lock and @unlock run under pci_tsm_rwsem held for write to + * sync with TSM unregistration and each other + */ + struct_group_tagged(pci_tsm_devsec_ops, devsec_ops, + struct pci_tsm *(*lock)(struct tsm_dev *tsm_dev, + struct pci_dev *pdev); + void (*unlock)(struct pci_tsm *tsm); + ); +}; + +/** + * struct pci_tsm - Core TSM context for a given PCIe endpoint + * @pdev: Back ref to device function, distinguishes type of pci_tsm context + * @dsm_dev: PCI Device Security Manager for link operations on @pdev + * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device + * Function Security operations + * + * This structure is wrapped by low level TSM driver data and returned by + * probe()/lock(), it is freed by the corresponding remove()/unlock(). + * + * For link operations it serves to cache the association between a Device + * Security Manager (DSM) and the functions that manager can assign to a TVM. + * That can be "self", for assigning function0 of a TEE I/O device, a + * sub-function (SR-IOV virtual function, or non-function0 + * multifunction-device), or a downstream endpoint (PCIe upstream switch-port as + * DSM). + */ +struct pci_tsm { + struct pci_dev *pdev; + struct pci_dev *dsm_dev; + struct tsm_dev *tsm_dev; +}; + +/** + * struct pci_tsm_pf0 - Physical Function 0 TDISP link context + * @base_tsm: generic core "tsm" context + * @lock: mutual exclustion for pci_tsm_ops invocation + * @doe_mb: PCIe Data Object Exchange mailbox + */ +struct pci_tsm_pf0 { + struct pci_tsm base_tsm; + struct mutex lock; + struct pci_doe_mb *doe_mb; +}; + +/* physical function0 and capable of 'connect' */ +static inline bool is_pci_tsm_pf0(struct pci_dev *pdev) +{ + if (!pdev) + return false; + + if (!pci_is_pcie(pdev)) + return false; + + if (pdev->is_virtfn) + return false; + + /* + * Allow for a Device Security Manager (DSM) associated with function0 + * of an Endpoint to coordinate TDISP requests for other functions + * (physical or virtual) of the device, or allow for an Upstream Port + * DSM to accept TDISP requests for the Endpoints downstream of the + * switch. + */ + switch (pci_pcie_type(pdev)) { + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_RC_END: + if (pdev->ide_cap || (pdev->devcap & PCI_EXP_DEVCAP_TEE)) + break; + fallthrough; + default: + return false; + } + + return PCI_FUNC(pdev->devfn) == 0; +} + +#ifdef CONFIG_PCI_TSM +int pci_tsm_register(struct tsm_dev *tsm_dev); +void pci_tsm_unregister(struct tsm_dev *tsm_dev); +int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm, + struct tsm_dev *tsm_dev); +int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, + struct tsm_dev *tsm_dev); +void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm); +int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, + size_t req_sz, void *resp, size_t resp_sz); +#else +static inline int pci_tsm_register(struct tsm_dev *tsm_dev) +{ + return 0; +} +static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) +{ +} +static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, + const void *req, size_t req_sz, + void *resp, size_t resp_sz) +{ + return -ENXIO; +} +#endif +#endif /*__PCI_TSM_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index b6a12a82be12..2f9c0cb6a50a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -546,6 +546,9 @@ struct pci_dev { u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ unsigned int ide_cfg:1; /* Config cycles over IDE */ unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ +#endif +#ifdef CONFIG_PCI_TSM + struct pci_tsm *tsm; /* TSM operation state */ #endif u16 acs_cap; /* ACS Capability offset */ u8 supported_speeds; /* Supported Link Speeds Vector */ diff --git a/include/linux/tsm.h b/include/linux/tsm.h index cd97c63ffa32..22e05b2aac69 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -108,9 +108,11 @@ struct tsm_report_ops { bool (*report_bin_attr_visible)(int n); }; +struct pci_tsm_ops; struct tsm_dev { struct device dev; int id; + const struct pci_tsm_ops *pci_ops; }; DEFINE_FREE(put_tsm_dev, struct tsm_dev *, @@ -118,6 +120,7 @@ DEFINE_FREE(put_tsm_dev, struct tsm_dev *, int tsm_report_register(const struct tsm_report_ops *ops, void *priv); int tsm_report_unregister(const struct tsm_report_ops *ops); -struct tsm_dev *tsm_register(struct device *parent); +struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); +struct tsm_dev *find_tsm_dev(int id); #endif /* __TSM_H */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 05bd22d9e352..f2759c1097bc 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -503,6 +503,7 @@ #define PCI_EXP_DEVCAP_PWR_VAL 0x03fc0000 /* Slot Power Limit Value */ #define PCI_EXP_DEVCAP_PWR_SCL 0x0c000000 /* Slot Power Limit Scale */ #define PCI_EXP_DEVCAP_FLR 0x10000000 /* Function Level Reset */ +#define PCI_EXP_DEVCAP_TEE 0x40000000 /* TEE I/O (TDISP) Support */ #define PCI_EXP_DEVCTL 0x08 /* Device Control */ #define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ #define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ -- cgit v1.2.3 From c0c1262fbfbafe943dbccd5f97b500b72dbd2205 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:57 -0700 Subject: PCI: Add PCIe Device 3 Extended Capability enumeration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r7.0 Section 7.7.9 Device 3 Extended Capability Structure, defines the canonical location for determining the Flit Mode of a device. This status is a dependency for PCIe IDE enabling. Add a new fm_enabled flag to 'struct pci_dev'. Cc: Lukas Wunner Cc: Ilpo Järvinen Cc: Bjorn Helgaas Cc: Samuel Ortiz Cc: Alexey Kardashevskiy Cc: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-6-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/probe.c | 12 ++++++++++++ include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 7 +++++++ 3 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index d1467348c169..3b54f1720be5 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2283,6 +2283,17 @@ int pci_configure_extended_tags(struct pci_dev *dev, void *ign) return 0; } +static void pci_dev3_init(struct pci_dev *pdev) +{ + u16 cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DEV3); + u32 val = 0; + + if (!cap) + return; + pci_read_config_dword(pdev, cap + PCI_DEV3_STA, &val); + pdev->fm_enabled = !!(val & PCI_DEV3_STA_SEGMENT); +} + /** * pcie_relaxed_ordering_enabled - Probe for PCIe relaxed ordering enable * @dev: PCI device to query @@ -2667,6 +2678,7 @@ static void pci_init_capabilities(struct pci_dev *dev) pci_doe_init(dev); /* Data Object Exchange */ pci_tph_init(dev); /* TLP Processing Hints */ pci_rebar_init(dev); /* Resizable BAR */ + pci_dev3_init(dev); /* Device 3 capabilities */ pci_ide_init(dev); /* Link Integrity and Data Encryption */ pcie_report_downtraining(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 2f9c0cb6a50a..ea94799c81b0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -450,6 +450,7 @@ struct pci_dev { unsigned int pasid_enabled:1; /* Process Address Space ID */ unsigned int pri_enabled:1; /* Page Request Interface */ unsigned int tph_enabled:1; /* TLP Processing Hints */ + unsigned int fm_enabled:1; /* Flit Mode (segment captured) */ unsigned int is_managed:1; /* Managed via devres */ unsigned int is_msi_managed:1; /* MSI release via devres installed */ unsigned int needs_freset:1; /* Requires fundamental reset */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f2759c1097bc..3add74ae2594 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -755,6 +755,7 @@ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ #define PCI_EXT_CAP_ID_PL_32GT 0x2A /* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_DEV3 0x2F /* Device 3 Capability/Control/Status */ #define PCI_EXT_CAP_ID_IDE 0x30 /* Integrity and Data Encryption */ #define PCI_EXT_CAP_ID_PL_64GT 0x31 /* Physical Layer 64.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_64GT @@ -1246,6 +1247,12 @@ /* Deprecated old name, replaced with PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE */ #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE +/* Device 3 Extended Capability */ +#define PCI_DEV3_CAP 0x04 /* Device 3 Capabilities Register */ +#define PCI_DEV3_CTL 0x08 /* Device 3 Control Register */ +#define PCI_DEV3_STA 0x0c /* Device 3 Status Register */ +#define PCI_DEV3_STA_SEGMENT 0x8 /* Segment Captured (end-to-end flit-mode detected) */ + /* Compute Express Link (CXL r3.1, sec 8.1.5) */ #define PCI_DVSEC_CXL_PORT 3 #define PCI_DVSEC_CXL_PORT_CTL 0x0c -- cgit v1.2.3 From 1e4d2ff3ae450dab37b5b5726c3f7df3e60d6e89 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:59 -0700 Subject: PCI/IDE: Add IDE establishment helpers There are two components to establishing an encrypted link, provisioning the stream in Partner Port config-space, and programming the keys into the link layer via IDE_KM (IDE Key Management). This new library, drivers/pci/ide.c, enables the former. IDE_KM, via a TSM low-level driver, is saved for later. With the platform TSM implementations of SEV-TIO and TDX Connect in mind this library abstracts small differences in those implementations. For example, TDX Connect handles Root Port register setup while SEV-TIO expects System Software to update the Root Port registers. This is the rationale for fine-grained 'setup' + 'enable' verbs. The other design detail for TSM-coordinated IDE establishment is that the TSM may manage allocation of Stream IDs, this is why the Stream ID value is passed in to pci_ide_stream_setup(). The flow is: pci_ide_stream_alloc(): Allocate a Selective IDE Stream Register Block in each Partner Port (Endpoint + Root Port), and reserve a host bridge / platform stream slot. Gather Partner Port specific stream settings like Requester ID. pci_ide_stream_register(): Publish the stream in sysfs after allocating a Stream ID. In the TSM case the TSM allocates the Stream ID for the Partner Port pair. pci_ide_stream_setup(): Program the stream settings to a Partner Port. Caller is responsible for optionally calling this for the Root Port as well if the TSM implementation requires it. pci_ide_stream_enable(): Enable the stream after IDE_KM. In support of system administrators auditing where platform, Root Port, and Endpoint IDE stream resources are being spent, the allocated stream is reflected as a symlink from the host bridge to the endpoint with the name: stream%d.%d.%d Where the tuple of integers reflects the allocated platform, Root Port, and Endpoint stream index (Selective IDE Stream Register Block) values. Thanks to Wu Hao for a draft implementation of this infrastructure. Cc: Bjorn Helgaas Cc: Lukas Wunner Cc: Samuel Ortiz Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-8-dan.j.williams@intel.com Signed-off-by: Dan Williams --- .../ABI/testing/sysfs-devices-pci-host-bridge | 14 + drivers/pci/ide.c | 428 +++++++++++++++++++++ drivers/pci/pci.h | 2 + drivers/pci/probe.c | 1 + include/linux/pci-ide.h | 78 ++++ include/linux/pci.h | 6 + 6 files changed, 529 insertions(+) create mode 100644 include/linux/pci-ide.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge index 8c3a652799f1..2c66e5bb2bf8 100644 --- a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge +++ b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge @@ -17,3 +17,17 @@ Description: PNP0A08 (/sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A08:00). See /sys/devices/pciDDDD:BB entry for details about the DDDD:BB format. + +What: pciDDDD:BB/streamH.R.E +Contact: linux-pci@vger.kernel.org +Description: + (RO) When a platform has established a secure connection, PCIe + IDE, between two Partner Ports, this symlink appears. A stream + consumes a Stream ID slot in each of the Host bridge (H), Root + Port (R) and Endpoint (E). The link points to the Endpoint PCI + device in the Selective IDE Stream pairing. Specifically, "R" + and "E" represent the assigned Selective IDE Stream Register + Block in the Root Port and Endpoint, and "H" represents a + platform specific pool of stream resources shared by the Root + Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for + details about the DDDD:BB format. diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 26866edf91b4..7643840738fe 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -5,8 +5,12 @@ #define dev_fmt(fmt) "PCI/IDE: " fmt #include +#include #include +#include #include +#include +#include #include "pci.h" @@ -23,12 +27,25 @@ static int __sel_ide_offset(u16 ide_cap, u8 nr_link_ide, u8 stream_index, return offset + stream_index * PCI_IDE_SEL_BLOCK_SIZE(nr_ide_mem); } +static int sel_ide_offset(struct pci_dev *pdev, + struct pci_ide_partner *settings) +{ + return __sel_ide_offset(pdev->ide_cap, pdev->nr_link_ide, + settings->stream_index, pdev->nr_ide_mem); +} + void pci_ide_init(struct pci_dev *pdev) { u16 nr_link_ide, nr_ide_mem, nr_streams; u16 ide_cap; u32 val; + /* + * Unconditionally init so that ida idle state is consistent with + * pdev->ide_cap. + */ + ida_init(&pdev->ide_stream_ida); + if (!pci_is_pcie(pdev)) return; @@ -84,5 +101,416 @@ void pci_ide_init(struct pci_dev *pdev) pdev->ide_cap = ide_cap; pdev->nr_link_ide = nr_link_ide; + pdev->nr_sel_ide = nr_streams; pdev->nr_ide_mem = nr_ide_mem; } + +struct stream_index { + struct ida *ida; + u8 stream_index; +}; + +static void free_stream_index(struct stream_index *stream) +{ + ida_free(stream->ida, stream->stream_index); +} + +DEFINE_FREE(free_stream, struct stream_index *, if (_T) free_stream_index(_T)) +static struct stream_index *alloc_stream_index(struct ida *ida, u16 max, + struct stream_index *stream) +{ + int id; + + if (!max) + return NULL; + + id = ida_alloc_max(ida, max - 1, GFP_KERNEL); + if (id < 0) + return NULL; + + *stream = (struct stream_index) { + .ida = ida, + .stream_index = id, + }; + return stream; +} + +/** + * pci_ide_stream_alloc() - Reserve stream indices and probe for settings + * @pdev: IDE capable PCIe Endpoint Physical Function + * + * Retrieve the Requester ID range of @pdev for programming its Root + * Port IDE RID Association registers, and conversely retrieve the + * Requester ID of the Root Port for programming @pdev's IDE RID + * Association registers. + * + * Allocate a Selective IDE Stream Register Block instance per port. + * + * Allocate a platform stream resource from the associated host bridge. + * Retrieve stream association parameters for Requester ID range and + * address range restrictions for the stream. + */ +struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) +{ + /* EP, RP, + HB Stream allocation */ + struct stream_index __stream[PCI_IDE_HB + 1]; + struct pci_host_bridge *hb; + struct pci_dev *rp; + int num_vf, rid_end; + + if (!pci_is_pcie(pdev)) + return NULL; + + if (pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT) + return NULL; + + if (!pdev->ide_cap) + return NULL; + + struct pci_ide *ide __free(kfree) = kzalloc(sizeof(*ide), GFP_KERNEL); + if (!ide) + return NULL; + + hb = pci_find_host_bridge(pdev->bus); + struct stream_index *hb_stream __free(free_stream) = alloc_stream_index( + &hb->ide_stream_ida, hb->nr_ide_streams, &__stream[PCI_IDE_HB]); + if (!hb_stream) + return NULL; + + rp = pcie_find_root_port(pdev); + struct stream_index *rp_stream __free(free_stream) = alloc_stream_index( + &rp->ide_stream_ida, rp->nr_sel_ide, &__stream[PCI_IDE_RP]); + if (!rp_stream) + return NULL; + + struct stream_index *ep_stream __free(free_stream) = alloc_stream_index( + &pdev->ide_stream_ida, pdev->nr_sel_ide, &__stream[PCI_IDE_EP]); + if (!ep_stream) + return NULL; + + /* for SR-IOV case, cover all VFs */ + num_vf = pci_num_vf(pdev); + if (num_vf) + rid_end = PCI_DEVID(pci_iov_virtfn_bus(pdev, num_vf), + pci_iov_virtfn_devfn(pdev, num_vf)); + else + rid_end = pci_dev_id(pdev); + + *ide = (struct pci_ide) { + .pdev = pdev, + .partner = { + [PCI_IDE_EP] = { + .rid_start = pci_dev_id(rp), + .rid_end = pci_dev_id(rp), + .stream_index = no_free_ptr(ep_stream)->stream_index, + }, + [PCI_IDE_RP] = { + .rid_start = pci_dev_id(pdev), + .rid_end = rid_end, + .stream_index = no_free_ptr(rp_stream)->stream_index, + }, + }, + .host_bridge_stream = no_free_ptr(hb_stream)->stream_index, + .stream_id = -1, + }; + + return_ptr(ide); +} +EXPORT_SYMBOL_GPL(pci_ide_stream_alloc); + +/** + * pci_ide_stream_free() - unwind pci_ide_stream_alloc() + * @ide: idle IDE settings descriptor + * + * Free all of the stream index (register block) allocations acquired by + * pci_ide_stream_alloc(). The stream represented by @ide is assumed to + * be unregistered and not instantiated in any device. + */ +void pci_ide_stream_free(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_dev *rp = pcie_find_root_port(pdev); + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + + ida_free(&pdev->ide_stream_ida, ide->partner[PCI_IDE_EP].stream_index); + ida_free(&rp->ide_stream_ida, ide->partner[PCI_IDE_RP].stream_index); + ida_free(&hb->ide_stream_ida, ide->host_bridge_stream); + kfree(ide); +} +EXPORT_SYMBOL_GPL(pci_ide_stream_free); + +/** + * pci_ide_stream_release() - unwind and release an @ide context + * @ide: partially or fully registered IDE settings descriptor + * + * In support of automatic cleanup of IDE setup routines perform IDE + * teardown in expected reverse order of setup and with respect to which + * aspects of IDE setup have successfully completed. + * + * Be careful that setup order mirrors this shutdown order. Otherwise, + * open code releasing the IDE context. + */ +void pci_ide_stream_release(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_dev *rp = pcie_find_root_port(pdev); + + if (ide->partner[PCI_IDE_RP].enable) + pci_ide_stream_disable(rp, ide); + + if (ide->partner[PCI_IDE_EP].enable) + pci_ide_stream_disable(pdev, ide); + + if (ide->partner[PCI_IDE_RP].setup) + pci_ide_stream_teardown(rp, ide); + + if (ide->partner[PCI_IDE_EP].setup) + pci_ide_stream_teardown(pdev, ide); + + if (ide->name) + pci_ide_stream_unregister(ide); + + pci_ide_stream_free(ide); +} +EXPORT_SYMBOL_GPL(pci_ide_stream_release); + +/** + * pci_ide_stream_register() - Prepare to activate an IDE Stream + * @ide: IDE settings descriptor + * + * After a Stream ID has been acquired for @ide, record the presence of + * the stream in sysfs. The expectation is that @ide is immutable while + * registered. + */ +int pci_ide_stream_register(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + u8 ep_stream, rp_stream; + int rc; + + if (ide->stream_id < 0 || ide->stream_id > U8_MAX) { + pci_err(pdev, "Setup fail: Invalid Stream ID: %d\n", ide->stream_id); + return -ENXIO; + } + + ep_stream = ide->partner[PCI_IDE_EP].stream_index; + rp_stream = ide->partner[PCI_IDE_RP].stream_index; + const char *name __free(kfree) = kasprintf(GFP_KERNEL, "stream%d.%d.%d", + ide->host_bridge_stream, + rp_stream, ep_stream); + if (!name) + return -ENOMEM; + + rc = sysfs_create_link(&hb->dev.kobj, &pdev->dev.kobj, name); + if (rc) + return rc; + + ide->name = no_free_ptr(name); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_register); + +/** + * pci_ide_stream_unregister() - unwind pci_ide_stream_register() + * @ide: idle IDE settings descriptor + * + * In preparation for freeing @ide, remove sysfs enumeration for the + * stream. + */ +void pci_ide_stream_unregister(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + + sysfs_remove_link(&hb->dev.kobj, ide->name); + kfree(ide->name); + ide->name = NULL; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_unregister); + +static int pci_ide_domain(struct pci_dev *pdev) +{ + if (pdev->fm_enabled) + return pci_domain_nr(pdev->bus); + return 0; +} + +struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide) +{ + if (!pci_is_pcie(pdev)) { + pci_warn_once(pdev, "not a PCIe device\n"); + return NULL; + } + + switch (pci_pcie_type(pdev)) { + case PCI_EXP_TYPE_ENDPOINT: + if (pdev != ide->pdev) { + pci_warn_once(pdev, "setup expected Endpoint: %s\n", pci_name(ide->pdev)); + return NULL; + } + return &ide->partner[PCI_IDE_EP]; + case PCI_EXP_TYPE_ROOT_PORT: { + struct pci_dev *rp = pcie_find_root_port(ide->pdev); + + if (pdev != rp) { + pci_warn_once(pdev, "setup expected Root Port: %s\n", + pci_name(rp)); + return NULL; + } + return &ide->partner[PCI_IDE_RP]; + } + default: + pci_warn_once(pdev, "invalid device type\n"); + return NULL; + } +} +EXPORT_SYMBOL_GPL(pci_ide_to_settings); + +static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide, + struct pci_ide_partner *settings, int pos, + bool enable) +{ + u32 val = FIELD_PREP(PCI_IDE_SEL_CTL_ID, ide->stream_id) | + FIELD_PREP(PCI_IDE_SEL_CTL_DEFAULT, settings->default_stream) | + FIELD_PREP(PCI_IDE_SEL_CTL_CFG_EN, pdev->ide_cfg) | + FIELD_PREP(PCI_IDE_SEL_CTL_TEE_LIMITED, pdev->ide_tee_limit) | + FIELD_PREP(PCI_IDE_SEL_CTL_EN, enable); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val); +} + +/** + * pci_ide_stream_setup() - program settings to Selective IDE Stream registers + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered IDE settings descriptor + * + * When @pdev is a PCI_EXP_TYPE_ENDPOINT then the PCI_IDE_EP partner + * settings are written to @pdev's Selective IDE Stream register block, + * and when @pdev is a PCI_EXP_TYPE_ROOT_PORT, the PCI_IDE_RP settings + * are selected. + */ +void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + u32 val; + + if (!settings) + return; + + pos = sel_ide_offset(pdev, settings); + + val = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, val); + + val = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) | + FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) | + FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev)); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, val); + + /* + * Setup control register early for devices that expect + * stream_id is set during key programming. + */ + set_ide_sel_ctl(pdev, ide, settings, pos, false); + settings->setup = 1; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_setup); + +/** + * pci_ide_stream_teardown() - disable the stream and clear all settings + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered IDE settings descriptor + * + * For stream destruction, zero all registers that may have been written + * by pci_ide_stream_setup(). Consider pci_ide_stream_disable() to leave + * settings in place while temporarily disabling the stream. + */ +void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + + if (!settings) + return; + + pos = sel_ide_offset(pdev, settings); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, 0); + settings->setup = 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_teardown); + +/** + * pci_ide_stream_enable() - enable a Selective IDE Stream + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered and setup IDE settings descriptor + * + * Activate the stream by writing to the Selective IDE Stream Control + * Register. + * + * Return: 0 if the stream successfully entered the "secure" state, and -EINVAL + * if @ide is invalid, and -ENXIO if the stream fails to enter the secure state. + * + * Note that the state may go "insecure" at any point after returning 0, but + * those events are equivalent to a "link down" event and handled via + * asynchronous error reporting. + * + * Caller is responsible to clear the enable bit in the -ENXIO case. + */ +int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + u32 val; + + if (!settings) + return -EINVAL; + + pos = sel_ide_offset(pdev, settings); + + set_ide_sel_ctl(pdev, ide, settings, pos, true); + settings->enable = 1; + + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_STS, &val); + if (FIELD_GET(PCI_IDE_SEL_STS_STATE, val) != + PCI_IDE_SEL_STS_STATE_SECURE) + return -ENXIO; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_enable); + +/** + * pci_ide_stream_disable() - disable a Selective IDE Stream + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered and setup IDE settings descriptor + * + * Clear the Selective IDE Stream Control Register, but leave all other + * registers untouched. + */ +void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int pos; + + if (!settings) + return; + + pos = sel_ide_offset(pdev, settings); + + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0); + settings->enable = 0; +} +EXPORT_SYMBOL_GPL(pci_ide_stream_disable); + +void pci_ide_init_host_bridge(struct pci_host_bridge *hb) +{ + hb->nr_ide_streams = 256; + ida_init(&hb->ide_stream_ida); +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 6e4cc1c9aa58..d3f16be40102 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -615,8 +615,10 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #ifdef CONFIG_PCI_IDE void pci_ide_init(struct pci_dev *dev); +void pci_ide_init_host_bridge(struct pci_host_bridge *hb); #else static inline void pci_ide_init(struct pci_dev *dev) { } +static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { } #endif #ifdef CONFIG_PCI_TSM diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 3b54f1720be5..93fa7ba8dfa6 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -672,6 +672,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_dpc = 1; bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET; bridge->native_cxl_error = 1; + pci_ide_init_host_bridge(bridge); device_initialize(&bridge->dev); } diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h new file mode 100644 index 000000000000..e638f9429bf9 --- /dev/null +++ b/include/linux/pci-ide.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common helpers for drivers (e.g. low-level PCI/TSM drivers) implementing the + * IDE key management protocol (IDE_KM) as defined by: + * PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE) + * + * Copyright(c) 2024-2025 Intel Corporation. All rights reserved. + */ + +#ifndef __PCI_IDE_H__ +#define __PCI_IDE_H__ + +enum pci_ide_partner_select { + PCI_IDE_EP, + PCI_IDE_RP, + PCI_IDE_PARTNER_MAX, + /* + * In addition to the resources in each partner port the + * platform / host-bridge additionally has a Stream ID pool that + * it shares across root ports. Let pci_ide_stream_alloc() use + * the alloc_stream_index() helper as endpoints and root ports. + */ + PCI_IDE_HB = PCI_IDE_PARTNER_MAX, +}; + +/** + * struct pci_ide_partner - Per port pair Selective IDE Stream settings + * @rid_start: Partner Port Requester ID range start + * @rid_end: Partner Port Requester ID range end + * @stream_index: Selective IDE Stream Register Block selection + * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of + * address and RID association registers + * @setup: flag to track whether to run pci_ide_stream_teardown() for this + * partner slot + * @enable: flag whether to run pci_ide_stream_disable() for this partner slot + */ +struct pci_ide_partner { + u16 rid_start; + u16 rid_end; + u8 stream_index; + unsigned int default_stream:1; + unsigned int setup:1; + unsigned int enable:1; +}; + +/** + * struct pci_ide - PCIe Selective IDE Stream descriptor + * @pdev: PCIe Endpoint in the pci_ide_partner pair + * @partner: per-partner settings + * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool + * @stream_id: unique Stream ID (within Partner Port pairing) + * @name: name of the established Selective IDE Stream in sysfs + * + * Negative @stream_id values indicate "uninitialized" on the + * expectation that with TSM established IDE the TSM owns the stream_id + * allocation. + */ +struct pci_ide { + struct pci_dev *pdev; + struct pci_ide_partner partner[PCI_IDE_PARTNER_MAX]; + u8 host_bridge_stream; + int stream_id; + const char *name; +}; + +struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, + struct pci_ide *ide); +struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev); +void pci_ide_stream_free(struct pci_ide *ide); +int pci_ide_stream_register(struct pci_ide *ide); +void pci_ide_stream_unregister(struct pci_ide *ide); +void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide); +int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_release(struct pci_ide *ide); +DEFINE_FREE(pci_ide_stream_release, struct pci_ide *, if (_T) pci_ide_stream_release(_T)) +#endif /* __PCI_IDE_H__ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index ea94799c81b0..2c8dbae4916c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -545,6 +545,8 @@ struct pci_dev { u16 ide_cap; /* Link Integrity & Data Encryption */ u8 nr_ide_mem; /* Address association resources for streams */ u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ + u16 nr_sel_ide; /* Selective Stream count (register block allocator) */ + struct ida ide_stream_ida; unsigned int ide_cfg:1; /* Config cycles over IDE */ unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ #endif @@ -614,6 +616,10 @@ struct pci_host_bridge { int domain_nr; struct list_head windows; /* resource_entry */ struct list_head dma_ranges; /* dma ranges resource list */ +#ifdef CONFIG_PCI_IDE + u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */ + struct ida ide_stream_ida; +#endif u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); -- cgit v1.2.3 From 9ddaf9c3ed007cd03c1335fb40920ad76f72a3d5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:29:00 -0700 Subject: PCI/IDE: Report available IDE streams The limited number of link-encryption (IDE) streams that a given set of host bridges supports is a platform specific detail. Provide pci_ide_init_nr_streams() as a generic facility for either platform TSM drivers, or PCI core native IDE, to report the number available streams. After invoking pci_ide_init_nr_streams() an "available_secure_streams" attribute appears in PCI host bridge sysfs to convey that count. Introduce a device-type, @pci_host_bridge_type, now that both a release method and sysfs attribute groups are being specified for all 'struct pci_host_bridge' instances. Cc: Bjorn Helgaas Cc: Lukas Wunner Cc: Samuel Ortiz Cc: Alexey Kardashevskiy Cc: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-9-dan.j.williams@intel.com Signed-off-by: Dan Williams --- .../ABI/testing/sysfs-devices-pci-host-bridge | 12 ++++ drivers/pci/ide.c | 67 ++++++++++++++++++++++ drivers/pci/pci.h | 1 + drivers/pci/probe.c | 14 ++++- include/linux/pci-ide.h | 1 + 5 files changed, 94 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge index 2c66e5bb2bf8..b91ec3450811 100644 --- a/Documentation/ABI/testing/sysfs-devices-pci-host-bridge +++ b/Documentation/ABI/testing/sysfs-devices-pci-host-bridge @@ -31,3 +31,15 @@ Description: platform specific pool of stream resources shared by the Root Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for details about the DDDD:BB format. + +What: pciDDDD:BB/available_secure_streams +Contact: linux-pci@vger.kernel.org +Description: + (RO) When a host bridge has Root Ports that support PCIe IDE + (link encryption and integrity protection) there may be a + limited number of Selective IDE Streams that can be used for + establishing new end-to-end secure links. This attribute + decrements upon secure link setup, and increments upon secure + link teardown. The in-use stream count is determined by counting + stream symlinks. See /sys/devices/pciDDDD:BB entry for details + about the DDDD:BB format. diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 7643840738fe..4ae3872589fc 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -514,3 +514,70 @@ void pci_ide_init_host_bridge(struct pci_host_bridge *hb) hb->nr_ide_streams = 256; ida_init(&hb->ide_stream_ida); } + +static ssize_t available_secure_streams_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_host_bridge *hb = to_pci_host_bridge(dev); + int nr = READ_ONCE(hb->nr_ide_streams); + int avail = nr; + + if (!nr) + return -ENXIO; + + /* + * Yes, this is inefficient and racy, but it is only for occasional + * platform resource surveys. Worst case is bounded to 256 streams. + */ + for (int i = 0; i < nr; i++) + if (ida_exists(&hb->ide_stream_ida, i)) + avail--; + return sysfs_emit(buf, "%d\n", avail); +} +static DEVICE_ATTR_RO(available_secure_streams); + +static struct attribute *pci_ide_attrs[] = { + &dev_attr_available_secure_streams.attr, + NULL +}; + +static umode_t pci_ide_attr_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_host_bridge *hb = to_pci_host_bridge(dev); + + if (a == &dev_attr_available_secure_streams.attr) + if (!hb->nr_ide_streams) + return 0; + + return a->mode; +} + +const struct attribute_group pci_ide_attr_group = { + .attrs = pci_ide_attrs, + .is_visible = pci_ide_attr_visible, +}; + +/** + * pci_ide_set_nr_streams() - sets size of the pool of IDE Stream resources + * @hb: host bridge boundary for the stream pool + * @nr: number of streams + * + * Platform PCI init and/or expert test module use only. Limit IDE + * Stream establishment by setting the number of stream resources + * available at the host bridge. Platform init code must set this before + * the first pci_ide_stream_alloc() call if the platform has less than the + * default of 256 streams per host-bridge. + * + * The "PCI_IDE" symbol namespace is required because this is typically + * a detail that is settled in early PCI init. I.e. this export is not + * for endpoint drivers. + */ +void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr) +{ + hb->nr_ide_streams = min(nr, 256); + WARN_ON_ONCE(!ida_is_empty(&hb->ide_stream_ida)); + sysfs_update_group(&hb->dev.kobj, &pci_ide_attr_group); +} +EXPORT_SYMBOL_NS_GPL(pci_ide_set_nr_streams, "PCI_IDE"); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d3f16be40102..f6ffe5ee4717 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -616,6 +616,7 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #ifdef CONFIG_PCI_IDE void pci_ide_init(struct pci_dev *dev); void pci_ide_init_host_bridge(struct pci_host_bridge *hb); +extern const struct attribute_group pci_ide_attr_group; #else static inline void pci_ide_init(struct pci_dev *dev) { } static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 93fa7ba8dfa6..cfacf5bcd073 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -653,6 +653,18 @@ static void pci_release_host_bridge_dev(struct device *dev) kfree(bridge); } +static const struct attribute_group *pci_host_bridge_groups[] = { +#ifdef CONFIG_PCI_IDE + &pci_ide_attr_group, +#endif + NULL +}; + +static const struct device_type pci_host_bridge_type = { + .groups = pci_host_bridge_groups, + .release = pci_release_host_bridge_dev, +}; + static void pci_init_host_bridge(struct pci_host_bridge *bridge) { INIT_LIST_HEAD(&bridge->windows); @@ -672,6 +684,7 @@ static void pci_init_host_bridge(struct pci_host_bridge *bridge) bridge->native_dpc = 1; bridge->domain_nr = PCI_DOMAIN_NR_NOT_SET; bridge->native_cxl_error = 1; + bridge->dev.type = &pci_host_bridge_type; pci_ide_init_host_bridge(bridge); device_initialize(&bridge->dev); @@ -686,7 +699,6 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv) return NULL; pci_init_host_bridge(bridge); - bridge->dev.release = pci_release_host_bridge_dev; return bridge; } diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index e638f9429bf9..85645b0a8620 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -63,6 +63,7 @@ struct pci_ide { const char *name; }; +void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide); struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev); -- cgit v1.2.3 From a4438f06b1db15ce3d831ce82b8767665638aa2a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:29:01 -0700 Subject: PCI/TSM: Report active IDE streams Given that the platform TSM owns IDE Stream ID allocation, report the active streams via the TSM class device. Establish a symlink from the class device to the PCI endpoint device consuming the stream, named by the Stream ID. Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251031212902.2256310-10-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-class-tsm | 10 ++++++++++ drivers/pci/ide.c | 4 ++++ drivers/virt/coco/tsm-core.c | 28 ++++++++++++++++++++++++++++ include/linux/pci-ide.h | 2 ++ include/linux/tsm.h | 3 +++ 5 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-tsm b/Documentation/ABI/testing/sysfs-class-tsm index 2949468deaf7..6fc1a5ac6da1 100644 --- a/Documentation/ABI/testing/sysfs-class-tsm +++ b/Documentation/ABI/testing/sysfs-class-tsm @@ -7,3 +7,13 @@ Description: signals when the PCI layer is able to support establishment of link encryption and other device-security features coordinated through a platform tsm. + +What: /sys/class/tsm/tsmN/streamH.R.E +Contact: linux-pci@vger.kernel.org +Description: + (RO) When a host bridge has established a secure connection via + the platform TSM, symlink appears. The primary function of this + is have a system global review of TSM resource consumption + across host bridges. The link points to the endpoint PCI device + and matches the same link published by the host bridge. See + Documentation/ABI/testing/sysfs-devices-pci-host-bridge. diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 4ae3872589fc..da5b1acccbb4 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "pci.h" @@ -261,6 +262,9 @@ void pci_ide_stream_release(struct pci_ide *ide) if (ide->partner[PCI_IDE_EP].enable) pci_ide_stream_disable(pdev, ide); + if (ide->tsm_dev) + tsm_ide_stream_unregister(ide); + if (ide->partner[PCI_IDE_RP].setup) pci_ide_stream_teardown(rp, ide); diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c index 0e705f3067a1..f027876a2f19 100644 --- a/drivers/virt/coco/tsm-core.c +++ b/drivers/virt/coco/tsm-core.c @@ -4,11 +4,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include #include #include +#include static struct class *tsm_class; static DECLARE_RWSEM(tsm_rwsem); @@ -106,6 +108,32 @@ void tsm_unregister(struct tsm_dev *tsm_dev) } EXPORT_SYMBOL_GPL(tsm_unregister); +/* must be invoked between tsm_register / tsm_unregister */ +int tsm_ide_stream_register(struct pci_ide *ide) +{ + struct pci_dev *pdev = ide->pdev; + struct pci_tsm *tsm = pdev->tsm; + struct tsm_dev *tsm_dev = tsm->tsm_dev; + int rc; + + rc = sysfs_create_link(&tsm_dev->dev.kobj, &pdev->dev.kobj, ide->name); + if (rc) + return rc; + + ide->tsm_dev = tsm_dev; + return 0; +} +EXPORT_SYMBOL_GPL(tsm_ide_stream_register); + +void tsm_ide_stream_unregister(struct pci_ide *ide) +{ + struct tsm_dev *tsm_dev = ide->tsm_dev; + + ide->tsm_dev = NULL; + sysfs_remove_link(&tsm_dev->dev.kobj, ide->name); +} +EXPORT_SYMBOL_GPL(tsm_ide_stream_unregister); + static void tsm_release(struct device *dev) { struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev); diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 85645b0a8620..d0f10f3c89fc 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -50,6 +50,7 @@ struct pci_ide_partner { * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool * @stream_id: unique Stream ID (within Partner Port pairing) * @name: name of the established Selective IDE Stream in sysfs + * @tsm_dev: For TSM established IDE, the TSM device context * * Negative @stream_id values indicate "uninitialized" on the * expectation that with TSM established IDE the TSM owns the stream_id @@ -61,6 +62,7 @@ struct pci_ide { u8 host_bridge_stream; int stream_id; const char *name; + struct tsm_dev *tsm_dev; }; void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 22e05b2aac69..a3b7ab668eff 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -123,4 +123,7 @@ int tsm_report_unregister(const struct tsm_report_ops *ops); struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); struct tsm_dev *find_tsm_dev(int id); +struct pci_ide; +int tsm_ide_stream_register(struct pci_ide *ide); +void tsm_ide_stream_unregister(struct pci_ide *ide); #endif /* __TSM_H */ -- cgit v1.2.3 From 7c5b184db7145fd417785377337bd15c4fe1d0f4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:29:59 -0400 Subject: genpt: Generic Page Table base API The generic API is intended to be separated from the implementation of page table algorithms. It contains only accessors for walking and manipulating the table and helpers that are useful for building an implementation. Memory management is not in the generic API, but part of the implementation. Using a multi-compilation approach the implementation module would include headers in this order: common.h defs_FMT.h pt_defs.h FMT.h pt_common.h IMPLEMENTATION.h Where each compilation unit would have a combination of FMT and IMPLEMENTATION to produce a per-format per-implementation module. The API is designed so that the format headers have minimal logic, and default implementations are provided if the format doesn't include one. Generally formats provide their code via an inline function using the pattern: static inline FMTpt_XX(..) {} #define pt_XX FMTpt_XX The common code then enforces a function signature so that there is no drift in function arguments, or accidental polymorphic functions (as has been slightly troublesome in mm). Use of function-like #defines are avoided in the format even though many of the functions are small enough. Provide kdocs for the API surface. This is enough to implement the 8 initial format variations with all of their features: * Entries comprised of contiguous blocks of IO PTEs for larger page sizes (AMDv1, ARMv8) * Multi-level tables, up to 6 levels. Runtime selected top level * The size of the top table level can be selected at runtime (ARM's concatenated tables) * The number of levels in the table can optionally increase dynamically during map (AMDv1) * Optional leaf entries at any level * 32 bit/64 bit virtual and output addresses, using every bit * Sign extended addressing (x86) * Dirty tracking A basic simple format takes about 200 lines to declare the require inline functions. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- .clang-format | 1 + drivers/iommu/Kconfig | 2 + drivers/iommu/generic_pt/Kconfig | 20 + drivers/iommu/generic_pt/pt_common.h | 358 ++++++++++++++++ drivers/iommu/generic_pt/pt_defs.h | 329 +++++++++++++++ drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 +++++++++++ drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++++++++++++++++++++++ drivers/iommu/generic_pt/pt_log2.h | 122 ++++++ include/linux/generic_pt/common.h | 135 ++++++ 9 files changed, 1836 insertions(+) create mode 100644 drivers/iommu/generic_pt/Kconfig create mode 100644 drivers/iommu/generic_pt/pt_common.h create mode 100644 drivers/iommu/generic_pt/pt_defs.h create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h create mode 100644 drivers/iommu/generic_pt/pt_iter.h create mode 100644 drivers/iommu/generic_pt/pt_log2.h create mode 100644 include/linux/generic_pt/common.h (limited to 'include/linux') diff --git a/.clang-format b/.clang-format index f371a13b4d19..9e6a9177f8fb 100644 --- a/.clang-format +++ b/.clang-format @@ -415,6 +415,7 @@ ForEachMacros: - 'for_each_prop_dlc_cpus' - 'for_each_prop_dlc_platforms' - 'for_each_property_of_node' + - 'for_each_pt_level_entry' - 'for_each_rdt_resource' - 'for_each_reg' - 'for_each_reg_filtered' diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 70d29b14d851..c9ae3221cd6f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -384,3 +384,5 @@ config SPRD_IOMMU Say Y here if you want to use the multimedia devices listed above. endif # IOMMU_SUPPORT + +source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig new file mode 100644 index 000000000000..fb0f431ddba0 --- /dev/null +++ b/drivers/iommu/generic_pt/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menuconfig GENERIC_PT + bool "Generic Radix Page Table" + help + Generic library for building radix tree page tables. + + Generic PT provides a set of HW page table formats and a common + set of APIs to work with them. + +if GENERIC_PT +config DEBUG_GENERIC_PT + bool "Extra debugging checks for GENERIC_PT" + help + Enable extra run time debugging checks for GENERIC_PT code. This + incurs a runtime cost and should not be enabled for production + kernels. + + The kunit tests require this to be enabled to get full coverage. +endif diff --git a/drivers/iommu/generic_pt/pt_common.h b/drivers/iommu/generic_pt/pt_common.h new file mode 100644 index 000000000000..f64f800725db --- /dev/null +++ b/drivers/iommu/generic_pt/pt_common.h @@ -0,0 +1,358 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included after the format. It contains definitions + * that build on the format definitions to create the basic format API. + * + * The format API is listed here, with kdocs. The functions without bodies are + * implemented in the format using the pattern: + * static inline FMTpt_XXX(..) {..} + * #define pt_XXX FMTpt_XXX + * + * If the format doesn't implement a function then pt_fmt_defaults.h can provide + * a generic version. + * + * The routines marked "@pts: Entry to query" operate on the entire contiguous + * entry and can be called with a pts->index pointing to any sub item that makes + * up that entry. + * + * The header order is: + * pt_defs.h + * FMT.h + * pt_common.h + */ +#ifndef __GENERIC_PT_PT_COMMON_H +#define __GENERIC_PT_PT_COMMON_H + +#include "pt_defs.h" +#include "pt_fmt_defaults.h" + +/** + * pt_attr_from_entry() - Convert the permission bits back to attrs + * @pts: Entry to convert from + * @attrs: Resulting attrs + * + * Fill in the attrs with the permission bits encoded in the current leaf entry. + * The attrs should be usable with pt_install_leaf_entry() to reconstruct the + * same entry. + */ +static inline void pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs); + +/** + * pt_can_have_leaf() - True if the current level can have an OA entry + * @pts: The current level + * + * True if the current level can support pt_install_leaf_entry(). A leaf + * entry produce an OA. + */ +static inline bool pt_can_have_leaf(const struct pt_state *pts); + +/** + * pt_can_have_table() - True if the current level can have a lower table + * @pts: The current level + * + * Every level except 0 is allowed to have a lower table. + */ +static inline bool pt_can_have_table(const struct pt_state *pts) +{ + /* No further tables at level 0 */ + return pts->level > 0; +} + +/** + * pt_clear_entries() - Make entries empty (non-present) + * @pts: Starting table index + * @num_contig_lg2: Number of contiguous items to clear + * + * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY + * and does not have any effect on table walking. The starting index must be + * aligned to num_contig_lg2. + */ +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2); + +/** + * pt_entry_make_write_dirty() - Make an entry dirty + * @pts: Table entry to change + * + * Make pt_entry_is_write_dirty() return true for this entry. This can be called + * asynchronously with any other table manipulation under a RCU lock and must + * not corrupt the table. + */ +static inline bool pt_entry_make_write_dirty(struct pt_state *pts); + +/** + * pt_entry_make_write_clean() - Make the entry write clean + * @pts: Table entry to change + * + * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will + * eventually be notified of this change via a TLB flush, which is the point + * that the HW must become synchronized. Any "write dirty" prior to the TLB + * flush can be lost, but once the TLB flush completes all writes must make + * their entries write dirty. + * + * The format should alter the entry in a way that is compatible with any + * concurrent update from HW. The entire contiguous entry is changed. + */ +static inline void pt_entry_make_write_clean(struct pt_state *pts); + +/** + * pt_entry_is_write_dirty() - True if the entry has been written to + * @pts: Entry to query + * + * "write dirty" means that the HW has written to the OA translated + * by this entry. If the entry is contiguous then the consolidated + * "write dirty" for all the items must be returned. + */ +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts); + +/** + * pt_dirty_supported() - True if the page table supports dirty tracking + * @common: Page table to query + */ +static inline bool pt_dirty_supported(struct pt_common *common); + +/** + * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry + * @pts: Entry to query + * + * Return the number of contiguous items this leaf entry spans. If the entry + * is single item it returns ilog2(1). + */ +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts); + +/** + * pt_entry_oa() - Output Address for this leaf entry + * @pts: Entry to query + * + * Return the output address for the start of the entry. If the entry + * is contiguous this returns the same value for each sub-item. I.e.:: + * + * log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0 + * + * See pt_item_oa(). The format should implement one of these two functions + * depending on how it stores the OAs in the table. + */ +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts); + +/** + * pt_entry_oa_lg2sz() - Return the size of an OA entry + * @pts: Entry to query + * + * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise + * it returns the total VA/OA size of the entire contiguous entry. + */ +static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts) +{ + return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts); +} + +/** + * pt_entry_oa_exact() - Return the complete OA for an entry + * @pts: Entry to query + * + * During iteration the first entry could have a VA with an offset from the + * natural start of the entry. Return the exact OA including the pts's VA + * offset. + */ +static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts) +{ + return _pt_entry_oa_fast(pts) | + log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts)); +} + +/** + * pt_full_va_prefix() - The top bits of the VA + * @common: Page table to query + * + * This is usually 0, but some formats have their VA space going downward from + * PT_VADDR_MAX, and will return that instead. This value must always be + * adjusted by struct pt_common max_vasz_lg2. + */ +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common); + +/** + * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry + * @common: Page table to query + * + * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT). + * This is useful to create optimized paths for common cases of PAGE_SIZE + * mappings. + */ +static inline bool pt_has_system_page_size(const struct pt_common *common); + +/** + * pt_install_leaf_entry() - Write a leaf entry to the table + * @pts: Table index to change + * @oa: Output Address for this leaf + * @oasz_lg2: Size in VA/OA for this leaf + * @attrs: Attributes to modify the entry + * + * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates + * the VA indicated by pts to the given OA. + * + * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz(). + * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2. + * + * This must not be called if pt_can_have_leaf() == false. Contiguous sizes + * not indicated by pt_possible_sizes() must not be specified. + */ +static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs); + +/** + * pt_install_table() - Write a table entry to the table + * @pts: Table index to change + * @table_pa: CPU physical address of the lower table's memory + * @attrs: Attributes to modify the table index + * + * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa + * is the table at pts->level - 1. This is done by cmpxchg so pts must have the + * current entry loaded. The pts is updated with the installed entry. + * + * This must not be called if pt_can_have_table() == false. + * + * Returns: true if the table was installed successfully. + */ +static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs); + +/** + * pt_item_oa() - Output Address for this leaf item + * @pts: Item to query + * + * Return the output address for this item. If the item is part of a contiguous + * entry it returns the value of the OA for this individual sub item. + * + * See pt_entry_oa(). The format should implement one of these two functions + * depending on how it stores the OA's in the table. + */ +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts); + +/** + * pt_load_entry_raw() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Return the type of entry that was loaded. pts->entry will be filled in with + * the entry's content. See pt_load_entry() + */ +static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts); + +/** + * pt_max_oa_lg2() - Return the maximum OA the table format can hold + * @common: Page table to query + * + * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the + * OA. This is the absolute maximum address the table can hold. struct pt_common + * max_oasz_lg2 sets a lower dynamic maximum based on HW capability. + */ +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common); + +/** + * pt_num_items_lg2() - Return the number of items in this table level + * @pts: The current level + * + * The number of items in a table level defines the number of bits this level + * decodes from the VA. This function is not called for the top level, + * so it does not need to compute a special value for the top case. The + * result for the top is based on pt_common max_vasz_lg2. + * + * The value is used as part of determining the table indexes via the + * equation:: + * + * log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2()) + */ +static inline unsigned int pt_num_items_lg2(const struct pt_state *pts); + +/** + * pt_pgsz_lg2_to_level - Return the level that maps the page size + * @common: Page table to query + * @pgsize_lg2: Log2 page size + * + * Returns the table level that will map the given page size. The page + * size must be part of the pt_possible_sizes() for some level. + */ +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2); + +/** + * pt_possible_sizes() - Return a bitmap of possible output sizes at this level + * @pts: The current level + * + * Each level has a list of possible output sizes that can be installed as + * leaf entries. If pt_can_have_leaf() is false returns zero. + * + * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating + * that a non-contiguous single item leaf entry is supported. The following + * pt_num_items_lg2() number of bits can be set indicating contiguous entries + * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be + * set, contiguous entries cannot span the entire table. + * + * The OR of pt_possible_sizes() of all levels is the typical bitmask of all + * supported sizes in the entire table. + */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts); + +/** + * pt_table_item_lg2sz() - Size of a single item entry in this table level + * @pts: The current level + * + * The size of the item specifies how much VA and OA a single item occupies. + * + * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous + * entries. + */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +/** + * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table + * @pts: The current level + * + * Return the size of VA decoded by the entire table level. + */ +static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts) +{ + if (pts->range->top_level == pts->level) + return pts->range->max_vasz_lg2; + return min_t(unsigned int, pts->range->common->max_vasz_lg2, + pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts)); +} + +/** + * pt_table_pa() - Return the CPU physical address of the table entry + * @pts: Entry to query + * + * This is only ever called on PT_ENTRY_TABLE entries. Must return the same + * value passed to pt_install_table(). + */ +static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts); + +/** + * pt_table_ptr() - Return a CPU pointer for a table item + * @pts: Entry to query + * + * Same as pt_table_pa() but returns a CPU pointer. + */ +static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts) +{ + return __va(pt_table_pa(pts)); +} + +/** + * pt_load_entry() - Read from the location pts points at into the pts + * @pts: Table index to load + * + * Set the type of entry that was loaded. pts->entry and pts->table_lower + * will be filled in with the entry's content. + */ +static inline void pt_load_entry(struct pt_state *pts) +{ + pts->type = pt_load_entry_raw(pts); + if (pts->type == PT_ENTRY_TABLE) + pts->table_lower = pt_table_ptr(pts); +} +#endif diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h new file mode 100644 index 000000000000..819057de50d8 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * This header is included before the format. It contains definitions + * that are required to compile the format. The header order is: + * pt_defs.h + * fmt_XX.h + * pt_common.h + */ +#ifndef __GENERIC_PT_DEFS_H +#define __GENERIC_PT_DEFS_H + +#include + +#include +#include +#include +#include +#include +#include +#include "pt_log2.h" + +/* Header self-compile default defines */ +#ifndef pt_write_attrs +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; +#endif + +struct pt_table_p; + +enum { + PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32, + PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX, + PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32, +}; + +/* + * The format instantiation can have features wired off or on to optimize the + * code gen. Supported features are just a reflection of what the current set of + * kernel users want to use. + */ +#ifndef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES 0 +#endif + +/* + * When in debug mode we compile all formats with all features. This allows the + * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or + * FULL_VA. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +enum { + PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, + PT_DEBUG_SUPPORTED_FEATURES = + UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? + BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : + BIT(PT_FEAT_SIGN_EXTEND)), +}; +#undef PT_SUPPORTED_FEATURES +#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES +#endif + +#ifndef PT_FORCE_ENABLED_FEATURES +#define PT_FORCE_ENABLED_FEATURES 0 +#endif + +/** + * DOC: Generic Page Table Language + * + * Language used in Generic Page Table + * VA + * The input address to the page table, often the virtual address. + * OA + * The output address from the page table, often the physical address. + * leaf + * An entry that results in an output address. + * start/end + * An half-open range, e.g. [0,0) refers to no VA. + * start/last + * An inclusive closed range, e.g. [0,0] refers to the VA 0 + * common + * The generic page table container struct pt_common + * level + * Level 0 is always a table of only leaves with no futher table pointers. + * Increasing levels increase the size of the table items. The least + * significant VA bits used to index page tables are used to index the Level + * 0 table. The various labels for table levels used by HW descriptions are + * not used. + * top_level + * The inclusive highest level of the table. A two-level table + * has a top level of 1. + * table + * A linear array of translation items for that level. + * index + * The position in a table of an element: item = table[index] + * item + * A single index in a table + * entry + * A single logical element in a table. If contiguous pages are not + * supported then item and entry are the same thing, otherwise entry refers + * to all the items that comprise a single contiguous translation. + * item/entry_size + * The number of bytes of VA the table index translates for. + * If the item is a table entry then the next table covers + * this size. If the entry translates to an output address then the + * full OA is: OA | (VA % entry_size) + * contig_count + * The number of consecutive items fused into a single entry. + * item_size * contig_count is the size of that entry's translation. + * lg2 + * Indicates the value is encoded as log2, i.e. 1<table)) + +/* + * Try to install a new table pointer. The locking methodology requires this to + * be atomic (multiple threads can race to install a pointer). The losing + * threads will fail the atomic and return false. They should free any memory + * and reparse the table level again. + */ +#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64) +static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry) +{ + u64 *entryp = pt_cur_table(pts, u64) + pts->index; + u64 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg64_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} +#endif + +static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry) +{ + u32 *entryp = pt_cur_table(pts, u32) + pts->index; + u32 old_entry = pts->entry; + bool ret; + + /* + * Ensure the zero'd table content itself is visible before its PTE can + * be. release is a NOP on !SMP, but the HW is still doing an acquire. + */ + if (!IS_ENABLED(CONFIG_SMP)) + dma_wmb(); + ret = try_cmpxchg_release(entryp, &old_entry, table_entry); + if (ret) + pts->entry = table_entry; + return ret; +} + +#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr)) + +static inline bool pt_feature(const struct pt_common *common, + unsigned int feature_nr) +{ + if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr)) + return true; + if (!PT_SUPPORTED_FEATURE(feature_nr)) + return false; + return common->features & BIT(feature_nr); +} + +static inline bool pts_feature(const struct pt_state *pts, + unsigned int feature_nr) +{ + return pt_feature(pts->range->common, feature_nr); +} + +/* + * PT_WARN_ON is used for invariants that the kunit should be checking can't + * happen. + */ +#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) +#define PT_WARN_ON WARN_ON +#else +static inline bool PT_WARN_ON(bool condition) +{ + return false; +} +#endif + +/* These all work on the VA type */ +#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2) +#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2) +#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2) +#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2) +#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2) +#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2) +#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2) +#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2) +#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2) +#define vaffs(a) ffs_t(pt_vaddr_t, a) +#define vafls(a) fls_t(pt_vaddr_t, a) +#define vaffz(a) ffz_t(pt_vaddr_t, a) + +/* + * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and + * generate a useful defined result. The non-fva versions will malfunction at + * this extreme. + */ +static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return 0; + return log2_div_t(pt_vaddr_t, a, b_lg2); +} + +static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return a; + return log2_mod_t(pt_vaddr_t, a, b_lg2); +} + +static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b, + unsigned int c_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2) + return true; + return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val, + unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return val; + return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2); +} + +static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2) + return PT_VADDR_MAX; + return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2); +} + +/* These all work on the OA type */ +#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2) +#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2) +#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2) +#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2) +#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2) +#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2) +#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2) +#define oaffs(a) ffs_t(pt_oaddr_t, a) +#define oafls(a) fls_t(pt_oaddr_t, a) +#define oaffz(a) ffz_t(pt_oaddr_t, a) + +static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem, + unsigned int top_level) +{ + return top_level | (uintptr_t)table_mem; +} + +static inline void pt_top_set(struct pt_common *common, + struct pt_table_p *table_mem, + unsigned int top_level) +{ + WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level)); +} + +static inline void pt_top_set_level(struct pt_common *common, + unsigned int top_level) +{ + pt_top_set(common, NULL, top_level); +} + +static inline unsigned int pt_top_get_level(const struct pt_common *common) +{ + return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS); +} + +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2); + +#endif diff --git a/drivers/iommu/generic_pt/pt_fmt_defaults.h b/drivers/iommu/generic_pt/pt_fmt_defaults.h new file mode 100644 index 000000000000..60d594bbb106 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_fmt_defaults.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Default definitions for formats that don't define these functions. + */ +#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H +#define __GENERIC_PT_PT_FMT_DEFAULTS_H + +#include "pt_defs.h" +#include + +/* Header self-compile default defines */ +#ifndef pt_load_entry_raw +#include "fmt/amdv1.h" +#endif + +/* + * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and + * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top. + */ +#ifndef pt_table_item_lg2sz +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts) +{ + return PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level; +} +#endif + +#ifndef pt_pgsz_lg2_to_level +static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common, + unsigned int pgsize_lg2) +{ + return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) / + (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)); +} +#endif + +/* + * If not supplied by the format then contiguous pages are not supported. + * + * If contiguous pages are supported then the format must also provide + * pt_contig_count_lg2() if it supports a single contiguous size per level, + * or pt_possible_sizes() if it supports multiple sizes per level. + */ +#ifndef pt_entry_num_contig_lg2 +static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} + +/* + * Return the number of contiguous OA items forming an entry at this table level + */ +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts) +{ + return ilog2(1); +} +#endif + +/* If not supplied by the format then dirty tracking is not supported */ +#ifndef pt_entry_is_write_dirty +static inline bool pt_entry_is_write_dirty(const struct pt_state *pts) +{ + return false; +} + +static inline void pt_entry_make_write_clean(struct pt_state *pts) +{ +} + +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return false; +} +#else +/* If not supplied then dirty tracking is always enabled */ +#ifndef pt_dirty_supported +static inline bool pt_dirty_supported(struct pt_common *common) +{ + return true; +} +#endif +#endif + +#ifndef pt_entry_make_write_dirty +static inline bool pt_entry_make_write_dirty(struct pt_state *pts) +{ + return false; +} +#endif + +/* + * Format supplies either: + * pt_entry_oa - OA is at the start of a contiguous entry + * or + * pt_item_oa - OA is adjusted for every item in a contiguous entry + * + * Build the missing one + * + * The internal helper _pt_entry_oa_fast() allows generating + * an efficient pt_entry_oa_exact(), it doesn't care which + * option is selected. + */ +#ifdef pt_entry_oa +static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts) +{ + return pt_entry_oa(pts) | + log2_mul(pts->index, pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_entry_oa +#endif + +#ifdef pt_item_oa +static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts) +{ + return log2_set_mod(pt_item_oa(pts), 0, + pt_entry_num_contig_lg2(pts) + + pt_table_item_lg2sz(pts)); +} +#define _pt_entry_oa_fast pt_item_oa +#endif + +/* + * If not supplied by the format then use the constant + * PT_MAX_OUTPUT_ADDRESS_LG2. + */ +#ifndef pt_max_oa_lg2 +static inline unsigned int +pt_max_oa_lg2(const struct pt_common *common) +{ + return PT_MAX_OUTPUT_ADDRESS_LG2; +} +#endif + +#ifndef pt_has_system_page_size +static inline bool pt_has_system_page_size(const struct pt_common *common) +{ + return PT_GRANULE_LG2SZ == PAGE_SHIFT; +} +#endif + +/* + * If not supplied by the format then assume only one contiguous size determined + * by pt_contig_count_lg2() + */ +#ifndef pt_possible_sizes +static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts); + +/* Return a bitmap of possible leaf page sizes at this level */ +static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!pt_can_have_leaf(pts)) + return 0; + return log2_to_int(isz_lg2) | + log2_to_int(pt_contig_count_lg2(pts) + isz_lg2); +} +#endif + +/* If not supplied by the format then use 0. */ +#ifndef pt_full_va_prefix +static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common) +{ + return 0; +} +#endif + +/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */ +#ifndef pt_clear_entries +static inline void pt_clear_entries64(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries32(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u32 *tablep = pt_cur_table(pts, u32) + pts->index; + u32 *end = tablep + log2_to_int(num_contig_lg2); + + PT_WARN_ON(log2_mod(pts->index, num_contig_lg2)); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); +} + +static inline void pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + if (PT_ITEM_WORD_SIZE == sizeof(u32)) + pt_clear_entries32(pts, num_contig_lg2); + else + pt_clear_entries64(pts, num_contig_lg2); +} +#define pt_clear_entries pt_clear_entries +#endif + +/* + * Format can call in the pt_install_leaf_entry() to check the arguments are all + * aligned correctly. + */ +static inline bool pt_check_install_leaf_args(struct pt_state *pts, + pt_oaddr_t oa, + unsigned int oasz_lg2) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2))) + return false; + +#ifdef pt_possible_sizes + if (PT_WARN_ON(isz_lg2 > oasz_lg2 || + oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts))) + return false; +#else + if (PT_WARN_ON(oasz_lg2 != isz_lg2 && + oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts))) + return false; +#endif + + if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2))) + return false; + return true; +} + +#endif diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h new file mode 100644 index 000000000000..87f4a26c1a41 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -0,0 +1,636 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Iterators for Generic Page Table + */ +#ifndef __GENERIC_PT_PT_ITER_H +#define __GENERIC_PT_PT_ITER_H + +#include "pt_common.h" + +#include + +/* + * Use to mangle symbols so that backtraces and the symbol table are + * understandable. Any non-inlined function should get mangled like this. + */ +#define NS(fn) CONCATENATE(PTPFX, fn) + +/** + * pt_check_range() - Validate the range can be iterated + * @range: Range to validate + * + * Check that VA and last_va fall within the permitted range of VAs. If the + * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension + * is correct. + */ +static inline int pt_check_range(struct pt_range *range) +{ + pt_vaddr_t prefix; + + PT_WARN_ON(!range->max_vasz_lg2); + + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) { + PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2); + prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ? + PT_VADDR_MAX : + 0; + } else { + prefix = pt_full_va_prefix(range->common); + } + + if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) || + !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2)) + return -ERANGE; + return 0; +} + +/** + * pt_index_to_va() - Update range->va to the current pts->index + * @pts: Iteration State + * + * Adjust range->va to match the current index. This is done in a lazy manner + * since computing the VA takes several instructions and is rarely required. + */ +static inline void pt_index_to_va(struct pt_state *pts) +{ + pt_vaddr_t lower_va; + + lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts)); + pts->range->va = fvalog2_set_mod(pts->range->va, lower_va, + pt_table_oa_lg2sz(pts)); +} + +/* + * Add index_count_lg2 number of entries to pts's VA and index. The VA will be + * adjusted to the end of the contiguous block if it is currently in the middle. + */ +static inline void _pt_advance(struct pt_state *pts, + unsigned int index_count_lg2) +{ + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, + index_count_lg2); +} + +/** + * pt_entry_fully_covered() - Check if the item or entry is entirely contained + * within pts->range + * @pts: Iteration State + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or + * pt_entry_oa_lg2sz() + * + * Returns: true if the item is fully enclosed by the pts->range. + */ +static inline bool pt_entry_fully_covered(const struct pt_state *pts, + unsigned int oasz_lg2) +{ + struct pt_range *range = pts->range; + + /* Range begins at the start of the entry */ + if (log2_mod(pts->range->va, oasz_lg2)) + return false; + + /* Range ends past the end of the entry */ + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) + return true; + + /* Range ends at the end of the entry */ + return log2_mod_eq_max(range->last_va, oasz_lg2); +} + +/** + * pt_range_to_index() - Starting index for an iteration + * @pts: Iteration State + * + * Return: the starting index for the iteration in pts. + */ +static inline unsigned int pt_range_to_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + PT_WARN_ON(pts->level > pts->range->top_level); + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->va, + pts->range->max_vasz_lg2), + isz_lg2); + return log2_mod(log2_div(pts->range->va, isz_lg2), + pt_num_items_lg2(pts)); +} + +/** + * pt_range_to_end_index() - Ending index iteration + * @pts: Iteration State + * + * Return: the last index for the iteration in pts. + */ +static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + struct pt_range *range = pts->range; + unsigned int num_entries_lg2; + + if (range->va == range->last_va) + return pts->index + 1; + + if (pts->range->top_level == pts->level) + return log2_div(fvalog2_mod(pts->range->last_va, + pts->range->max_vasz_lg2), + isz_lg2) + + 1; + + num_entries_lg2 = pt_num_items_lg2(pts); + + /* last_va falls within this table */ + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) + return log2_mod(log2_div(pts->range->last_va, isz_lg2), + num_entries_lg2) + + 1; + + return log2_to_int(num_entries_lg2); +} + +static inline void _pt_iter_first(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pts->end_index = pt_range_to_end_index(pts); + PT_WARN_ON(pts->index > pts->end_index); +} + +static inline bool _pt_iter_load(struct pt_state *pts) +{ + if (pts->index >= pts->end_index) + return false; + pt_load_entry(pts); + return true; +} + +/** + * pt_next_entry() - Advance pts to the next entry + * @pts: Iteration State + * + * Update pts to go to the next index at this level. If pts is pointing at a + * contiguous entry then the index may advance my more than one. + */ +static inline void pt_next_entry(struct pt_state *pts) +{ + if (pts->type == PT_ENTRY_OA && + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); + else + pts->index++; + pt_index_to_va(pts); +} + +/** + * for_each_pt_level_entry() - For loop wrapper over entries in the range + * @pts: Iteration State + * + * This is the basic iteration primitive. It iterates over all the entries in + * pts->range that fall within the pts's current table level. Each step does + * pt_load_entry(pts). + */ +#define for_each_pt_level_entry(pts) \ + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) + +/** + * pt_load_single_entry() - Version of pt_load_entry() usable within a walker + * @pts: Iteration State + * + * Alternative to for_each_pt_level_entry() if the walker function uses only a + * single entry. + */ +static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts) +{ + pts->index = pt_range_to_index(pts); + pt_load_entry(pts); + return pts->type; +} + +static __always_inline struct pt_range _pt_top_range(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = { + .common = common, + .top_table = + (struct pt_table_p *)(top_of_table & + ~(uintptr_t)PT_TOP_LEVEL_MASK), + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), + }; + struct pt_state pts = { .range = &range, .level = range.top_level }; + unsigned int max_vasz_lg2; + + max_vasz_lg2 = common->max_vasz_lg2; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + pts.level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2, + pt_num_items_lg2(&pts) + + pt_table_item_lg2sz(&pts)); + + /* + * The top range will default to the lower region only with sign extend. + */ + range.max_vasz_lg2 = max_vasz_lg2; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND)) + max_vasz_lg2--; + + range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2); + range.last_va = + fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2); + return range; +} + +/** + * pt_top_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the + * total page table. Otherwise it returns the entire page table. + */ +static __always_inline struct pt_range pt_top_range(struct pt_common *common) +{ + /* + * The top pointer can change without locking. We capture the value and + * it's level here and are safe to walk it so long as both values are + * captured without tearing. + */ + return _pt_top_range(common, READ_ONCE(common->top_of_table)); +} + +/** + * pt_all_range() - Return a range that spans the entire page table + * @common: Table + * + * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND + * is supported range->va and range->last_va will be incorrect during the + * iteration and must not be accessed. + */ +static inline struct pt_range pt_all_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + /* + * Pretend the table is linear from 0 without a sign extension. This + * generates the correct indexes for iteration. + */ + range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2); + return range; +} + +/** + * pt_upper_range() - Return a range that spans part of the top level + * @common: Table + * + * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the + * total page table. Otherwise it returns the entire page table. + */ +static inline struct pt_range pt_upper_range(struct pt_common *common) +{ + struct pt_range range = pt_top_range(common); + + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) + return range; + + range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1); + range.last_va = PT_VADDR_MAX; + return range; +} + +/** + * pt_make_range() - Return a range that spans part of the table + * @common: Table + * @va: Start address + * @last_va: Last address + * + * The caller must validate the range with pt_check_range() before using it. + */ +static __always_inline struct pt_range +pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va) +{ + struct pt_range range = + _pt_top_range(common, READ_ONCE(common->top_of_table)); + + range.va = va; + range.last_va = last_va; + + return range; +} + +/* + * Span a slice of the table starting at a lower table level from an active + * walk. + */ +static __always_inline struct pt_range +pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va, + pt_vaddr_t last_va) +{ + struct pt_range range = *parent; + + range.va = va; + range.last_va = last_va; + + PT_WARN_ON(last_va < va); + PT_WARN_ON(pt_check_range(&range)); + + return range; +} + +/** + * pt_init() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * @level: Table level for the state + * @table: Pointer to the table memory at level + * + * Helper to initialize the on-stack pt_state from walker arguments. + */ +static __always_inline struct pt_state +pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = { + .range = range, + .table = table, + .level = level, + }; + return pts; +} + +/** + * pt_init_top() - Initialize a pt_state on the stack + * @range: Range pointer to embed in the state + * + * The pt_state points to the top most level. + */ +static __always_inline struct pt_state pt_init_top(struct pt_range *range) +{ + return pt_init(range, range->top_level, range->top_table); +} + +typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table); + +/** + * pt_descend() - Recursively invoke the walker for the lower level + * @pts: Iteration State + * @arg: Value to pass to the function + * @fn: Walker function to call + * + * pts must point to a table item. Invoke fn as a walker on the table + * pts points to. + */ +static __always_inline int pt_descend(struct pt_state *pts, void *arg, + pt_level_fn_t fn) +{ + int ret; + + if (PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower); + return ret; +} + +/** + * pt_walk_range() - Walk over a VA range + * @range: Range pointer + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * Walk over a VA range. The caller should have done a validity check, at + * least calling pt_check_range(), when building range. The walk will + * start at the top most table. + */ +static __always_inline int pt_walk_range(struct pt_range *range, + pt_level_fn_t fn, void *arg) +{ + return fn(range, arg, range->top_level, range->top_table); +} + +/* + * pt_walk_descend() - Recursively invoke the walker for a slice of a lower + * level + * @pts: Iteration State + * @va: Start address + * @last_va: Last address + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and over a slice of the + * lower table. The caller must ensure that va/last_va are within the table + * item. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int pt_walk_descend(const struct pt_state *pts, + pt_vaddr_t va, pt_vaddr_t last_va, + pt_level_fn_t fn, void *arg) +{ + struct pt_range range = pt_make_child_range(pts->range, va, last_va); + + if (PT_WARN_ON(!pt_can_have_table(pts)) || + PT_WARN_ON(!pts->table_lower)) + return -EINVAL; + + return fn(&range, arg, pts->level - 1, pts->table_lower); +} + +/* + * pt_walk_descend_all() - Recursively invoke the walker for a table item + * @parent_pts: Iteration State + * @fn: Walker function to call + * @arg: Value to pass to the function + * + * With pts pointing at a table item this will descend and over the entire lower + * table. This creates a new walk and does not alter pts or pts->range. + */ +static __always_inline int +pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, + void *arg) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts); + + return pt_walk_descend(parent_pts, + log2_set_mod(parent_pts->range->va, 0, isz_lg2), + log2_set_mod_max(parent_pts->range->va, isz_lg2), + fn, arg); +} + +/** + * pt_range_slice() - Return a range that spans indexes + * @pts: Iteration State + * @start_index: Starting index within pts + * @end_index: Ending index within pts + * + * Create a range than spans an index range of the current table level + * pt_state points at. + */ +static inline struct pt_range pt_range_slice(const struct pt_state *pts, + unsigned int start_index, + unsigned int end_index) +{ + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); + pt_vaddr_t last_va; + pt_vaddr_t va; + + va = fvalog2_set_mod(pts->range->va, + log2_mul(start_index, pt_table_item_lg2sz(pts)), + table_lg2sz); + last_va = fvalog2_set_mod( + pts->range->va, + log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz); + return pt_make_child_range(pts->range, va, last_va); +} + +/** + * pt_top_memsize_lg2() + * @common: Table + * @top_of_table: Top of table value from _pt_top_set() + * + * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this + * will compute the top size assuming the table will grow. + */ +static inline unsigned int pt_top_memsize_lg2(struct pt_common *common, + uintptr_t top_of_table) +{ + struct pt_range range = _pt_top_range(common, top_of_table); + struct pt_state pts = pt_init_top(&range); + unsigned int num_items_lg2; + + num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts); + if (range.top_level != PT_MAX_TOP_LEVEL && + pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts)); + + /* Round up the allocation size to the minimum alignment */ + return max(ffs_t(u64, PT_TOP_PHYS_MASK), + num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE)); +} + +/** + * pt_compute_best_pgsize() - Determine the best page size for leaf entries + * @pgsz_bitmap: Permitted page sizes + * @va: Starting virtual address for the leaf entry + * @last_va: Last virtual address for the leaf entry, sets the max page size + * @oa: Starting output address for the leaf entry + * + * Compute the largest page size for va, last_va, and oa together and return it + * in lg2. The largest page size depends on the format's supported page sizes at + * this level, and the relative alignment of the VA and OA addresses. 0 means + * the OA cannot be stored with the provided pgsz_bitmap. + */ +static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, + pt_vaddr_t va, + pt_vaddr_t last_va, + pt_oaddr_t oa) +{ + unsigned int best_pgsz_lg2; + unsigned int pgsz_lg2; + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t mask; + + if (PT_WARN_ON(va >= last_va)) + return 0; + + /* + * Given a VA/OA pair the best page size is the largest page size + * where: + * + * 1) VA and OA start at the page. Bitwise this is the count of least + * significant 0 bits. + * This also implies that last_va/oa has the same prefix as va/oa. + */ + mask = va | oa; + + /* + * 2) The page size is not larger than the last_va (length). Since page + * sizes are always power of two this can't be larger than the + * largest power of two factor of the length. + */ + mask |= log2_to_int(vafls(len) - 1); + + best_pgsz_lg2 = vaffs(mask); + + /* Choose the highest bit <= best_pgsz_lg2 */ + if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1) + pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1); + + pgsz_lg2 = vafls(pgsz_bitmap); + if (!pgsz_lg2) + return 0; + + pgsz_lg2--; + + PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0); + PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0); + PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va); + PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + PT_WARN_ON( + !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); + return pgsz_lg2; +} + +#define _PT_MAKE_CALL_LEVEL(fn) \ + static __always_inline int fn(struct pt_range *range, void *arg, \ + unsigned int level, \ + struct pt_table_p *table) \ + { \ + static_assert(PT_MAX_TOP_LEVEL <= 5); \ + if (level == 0) \ + return CONCATENATE(fn, 0)(range, arg, 0, table); \ + if (level == 1 || PT_MAX_TOP_LEVEL == 1) \ + return CONCATENATE(fn, 1)(range, arg, 1, table); \ + if (level == 2 || PT_MAX_TOP_LEVEL == 2) \ + return CONCATENATE(fn, 2)(range, arg, 2, table); \ + if (level == 3 || PT_MAX_TOP_LEVEL == 3) \ + return CONCATENATE(fn, 3)(range, arg, 3, table); \ + if (level == 4 || PT_MAX_TOP_LEVEL == 4) \ + return CONCATENATE(fn, 4)(range, arg, 4, table); \ + return CONCATENATE(fn, 5)(range, arg, 5, table); \ + } + +static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, + unsigned int unused_level, + struct pt_table_p *table) +{ + static_assert(PT_MAX_TOP_LEVEL <= 5); + return -EPROTOTYPE; +} + +#define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \ + static inline int fn(struct pt_range *range, void *arg, \ + unsigned int unused_level, \ + struct pt_table_p *table) \ + { \ + return do_fn(range, arg, level, table, descend_fn); \ + } + +/** + * PT_MAKE_LEVELS() - Build an unwound walker + * @fn: Name of the walker function + * @do_fn: Function to call at each level + * + * This builds a function call tree that can be fully inlined. + * The caller must provide a function body in an __always_inline function:: + * + * static __always_inline int do(struct pt_range *range, void *arg, + * unsigned int level, struct pt_table_p *table, + * pt_level_fn_t descend_fn) + * + * An inline function will be created for each table level that calls do_fn with + * a compile time constant for level and a pointer to the next lower function. + * This generates an optimally inlined walk where each of the functions sees a + * constant level and can codegen the exact constants/etc for that level. + * + * Note this can produce a lot of code! + */ +#define PT_MAKE_LEVELS(fn, do_fn) \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \ + do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \ + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \ + _PT_MAKE_CALL_LEVEL(fn) + +#endif diff --git a/drivers/iommu/generic_pt/pt_log2.h b/drivers/iommu/generic_pt/pt_log2.h new file mode 100644 index 000000000000..6dbbed119238 --- /dev/null +++ b/drivers/iommu/generic_pt/pt_log2.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Helper macros for working with log2 values + * + */ +#ifndef __GENERIC_PT_LOG2_H +#define __GENERIC_PT_LOG2_H +#include +#include + +/* Compute a */ +#define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) +static_assert(log2_to_int_t(unsigned int, 0) == 1); + +/* Compute a - 1 (aka all low bits set) */ +#define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) + +/* Compute a / b */ +#define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) +static_assert(log2_div_t(unsigned int, 4, 2) == 1); + +/* + * Compute: + * a / c == b / c + * aka the high bits are equal + */ +#define log2_div_eq_t(type, a, b, c_lg2) \ + (log2_div_t(type, (a) ^ (b), c_lg2) == 0) +static_assert(log2_div_eq_t(unsigned int, 1, 1, 2)); + +/* Compute a % b */ +#define log2_mod_t(type, a, b_lg2) \ + ((type)(((type)a) & log2_to_max_int_t(type, b_lg2))) +static_assert(log2_mod_t(unsigned int, 1, 2) == 1); + +/* + * Compute: + * a % b == b - 1 + * aka the low bits are all 1s + */ +#define log2_mod_eq_max_t(type, a, b_lg2) \ + (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2)) +static_assert(log2_mod_eq_max_t(unsigned int, 3, 2)); + +/* + * Return a value such that: + * a / b == ret / b + * ret % b == val + * aka set the low bits to val. val must be < b + */ +#define log2_set_mod_t(type, a, val, b_lg2) \ + ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val))) +static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1); + +/* Return a value such that: + * a / b == ret / b + * ret % b == b - 1 + * aka set the low bits to all 1s + */ +#define log2_set_mod_max_t(type, a, b_lg2) \ + (((type)(a)) | log2_to_max_int_t(type, b_lg2)) +static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3); + +/* Compute a * b */ +#define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2))) +static_assert(log2_mul_t(unsigned int, 2, 2) == 8); + +#define _dispatch_sz(type, fn, a) \ + (sizeof(type) == 4 ? fn##32((u32)a) : fn##64(a)) + +/* + * Return the highest value such that: + * fls_t(u32, 0) == 0 + * fls_t(u3, 1) == 1 + * a >= log2_to_int(ret - 1) + * aka find last set bit + */ +static inline unsigned int fls32(u32 a) +{ + return fls(a); +} +#define fls_t(type, a) _dispatch_sz(type, fls, a) + +/* + * Return the highest value such that: + * ffs_t(u32, 0) == UNDEFINED + * ffs_t(u32, 1) == 0 + * log_mod(a, ret) == 0 + * aka find first set bit + */ +static inline unsigned int __ffs32(u32 a) +{ + return __ffs(a); +} +#define ffs_t(type, a) _dispatch_sz(type, __ffs, a) + +/* + * Return the highest value such that: + * ffz_t(u32, U32_MAX) == UNDEFINED + * ffz_t(u32, 0) == 0 + * ffz_t(u32, 1) == 1 + * log_mod(a, ret) == log_to_max_int(ret) + * aka find first zero bit + */ +static inline unsigned int ffz32(u32 a) +{ + return ffz(a); +} +static inline unsigned int ffz64(u64 a) +{ + if (sizeof(u64) == sizeof(unsigned long)) + return ffz(a); + + if ((u32)a == U32_MAX) + return ffz32(a >> 32) + 32; + return ffz32(a); +} +#define ffz_t(type, a) _dispatch_sz(type, ffz, a) + +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h new file mode 100644 index 000000000000..e69a75511313 --- /dev/null +++ b/include/linux/generic_pt/common.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_COMMON_H +#define __GENERIC_PT_COMMON_H + +#include +#include +#include + +/** + * DOC: Generic Radix Page Table + * + * Generic Radix Page Table is a set of functions and helpers to efficiently + * parse radix style page tables typically seen in HW implementations. The + * interface is built to deliver similar code generation as the mm's pte/pmd/etc + * system by fully inlining the exact code required to handle each table level. + * + * Like the mm subsystem each format contributes its parsing implementation + * under common names and the common code implements the required algorithms. + * + * The system is divided into three logical levels: + * + * - The page table format and its manipulation functions + * - Generic helpers to give a consistent API regardless of underlying format + * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM) + * + * Multiple implementations are supported. The intention is to have the generic + * format code be re-usable for whatever specialized implementation is required. + * The generic code is solely about the format of the radix tree; it does not + * include memory allocation or higher level decisions that are left for the + * implementation. + * + * The generic framework supports a superset of functions across many HW + * implementations: + * + * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes + * - Multi-level tables, up to 6 levels. Runtime selected top level + * - Runtime variable table level size (ARM's concatenated tables) + * - Expandable top level allowing dynamic sizing of table levels + * - Optional leaf entries at any level + * - 32-bit/64-bit virtual and output addresses, using every address bit + * - Dirty tracking + * - Sign extended addressing + */ + +/** + * struct pt_common - struct for all page table implementations + */ +struct pt_common { + /** + * @top_of_table: Encodes the table top pointer and the top level in a + * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower + * bits of the aligned table pointer are used for the level. + */ + uintptr_t top_of_table; + /** + * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits + * must be zero. This may be less than what the page table format + * supports, but must not be more. + */ + u8 max_oasz_lg2; + /** + * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits + * are 0 or 1 depending on pt_full_va_prefix(). This may be less than + * what the page table format supports, but must not be more. When + * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability. + */ + u8 max_vasz_lg2; + /** + * @features: Bitmap of `enum pt_features` + */ + unsigned int features; +}; + +/* Encoding parameters for top_of_table */ +enum { + PT_TOP_LEVEL_BITS = 3, + PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0), +}; + +/** + * enum pt_features - Features turned on in the table. Each symbol is a bit + * position. + */ +enum pt_features { + /** + * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to + * PT_VADDR_MAX. + */ + PT_FEAT_FULL_VA, + /** + * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased + * dynamically during map. This requires HW support for atomically + * setting both the table top pointer and the starting table level. + */ + PT_FEAT_DYNAMIC_TOP, + /** + * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign + * extends up to the full pt_vaddr_t. This divides the page table into + * three VA ranges:: + * + * 0 -> 2^N - 1 Lower + * 2^N -> (MAX - 2^N - 1) Non-Canonical + * MAX - 2^N -> MAX Upper + * + * In this mode pt_common::max_vasz_lg2 includes the sign bit and the + * upper bits that don't fall within the translation are just validated. + * + * If not set there is no sign extension and valid VA goes from 0 to 2^N + * - 1. + */ + PT_FEAT_SIGN_EXTEND, + /** + * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA + * ranges which will clean out any walk cache or any IOPTE fully + * contained by the range. The optimization objective is to minimize the + * number of flushes even if ranges include IOVA gaps that do not need + * to be flushed. + */ + PT_FEAT_FLUSH_RANGE, + /** + * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that + * the optimization objective is to only flush IOVA that has been + * changed. This mode is suitable for cases like hypervisor shadowing + * where flushing unchanged ranges may cause the hypervisor to reparse + * significant amount of page table. + */ + PT_FEAT_FLUSH_RANGE_NO_GAPS, + /* private: */ + PT_FEAT_FMT_START, +}; + +#endif -- cgit v1.2.3 From cdb39d9185795b744dab4d4d782f2fe3f5eca10c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:01 -0400 Subject: iommupt: Add the basic structure of the iommu implementation The existing IOMMU page table implementations duplicate all of the working algorithms for each format. By using the generic page table API a single C version of the IOMMU algorithms can be created and re-used for all of the different formats used in the drivers. The implementation will provide a single C version of the iommu domain operations: iova_to_phys, map, unmap, and read_and_clear_dirty. Further, adding new algorithms and techniques becomes easy to do across the entire fleet of drivers and formats. The C functions are drop in compatible with the existing iommu_domain_ops using the IOMMU_PT_DOMAIN_OPS() macro. Each per-format implementation compilation unit will produce exported symbols following the pattern pt_iommu_FMT_map_pages() which the macro directly maps to the iommu_domain_ops members. This avoids the additional function pointer indirection like io-pgtable has. The top level struct used by the drivers is pt_iommu_table_FMT. It contains the other structs to allow container_of() to move between the driver, iommu page table, generic page table, and generic format layers. struct pt_iommu_table_amdv1 { struct pt_iommu { struct iommu_domain domain; } iommu; struct pt_amdv1 { struct pt_common common; } amdpt; }; The driver is expected to union the pt_iommu_table_FMT with its own existing domain struct: struct driver_domain { union { struct iommu_domain domain; struct pt_iommu_table_amdv1 amdv1; }; }; PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1, domain); To create an alias to avoid renaming 'domain' in a lot of driver code. This allows all the layers to access all the necessary functions to implement their different roles with no change to any of the existing iommu core code. Implement the basic starting point: pt_iommu_init(), get_info() and deinit(). Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/Kconfig | 13 ++ drivers/iommu/generic_pt/fmt/iommu_template.h | 39 ++++ drivers/iommu/generic_pt/iommu_pt.h | 259 ++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 150 +++++++++++++++ 4 files changed, 461 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h create mode 100644 drivers/iommu/generic_pt/iommu_pt.h create mode 100644 include/linux/generic_pt/iommu.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index fb0f431ddba0..a81dfdd72ca0 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -17,4 +17,17 @@ config DEBUG_GENERIC_PT kernels. The kunit tests require this to be enabled to get full coverage. + +config IOMMU_PT + tristate "IOMMU Page Tables" + select IOMMU_API + depends on IOMMU_SUPPORT + depends on GENERIC_PT + help + Generic library for building IOMMU page tables + + IOMMU_PT provides an implementation of the page table operations + related to struct iommu_domain using GENERIC_PT. It provides a single + implementation of the page table operations that can be shared by + multiple drivers. endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h new file mode 100644 index 000000000000..5b631bc07cbc --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * Template to build the iommu module and kunit from the format and + * implementation headers. + * + * The format should have: + * #define PT_FMT + * #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy)) + * And optionally: + * #define PT_FORCE_ENABLED_FEATURES .. + * #define PT_FMT_VARIANT + */ +#include +#include + +#ifdef PT_FMT_VARIANT +#define PTPFX_RAW \ + CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT) +#else +#define PTPFX_RAW PT_FMT +#endif + +#define PTPFX CONCATENATE(PTPFX_RAW, _) + +#define _PT_FMT_H PT_FMT.h +#define PT_FMT_H __stringify(_PT_FMT_H) + +#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H) +#define PT_DEFS_H __stringify(_PT_DEFS_H) + +#include +#include PT_DEFS_H +#include "../pt_defs.h" +#include PT_FMT_H +#include "../pt_common.h" + +#include "../iommu_pt.h" diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h new file mode 100644 index 000000000000..564f2d3a6e11 --- /dev/null +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * "Templated C code" for implementing the iommu operations for page tables. + * This is compiled multiple times, over all the page table formats to pick up + * the per-format definitions. + */ +#ifndef __GENERIC_PT_IOMMU_PT_H +#define __GENERIC_PT_IOMMU_PT_H + +#include "pt_iter.h" + +#include +#include +#include "../iommu-pages.h" + +#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) + +struct pt_iommu_collect_args { + struct iommu_pages_list free_list; +}; + +static int __collect_tables(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_collect_args *collect = arg; + int ret; + + if (!pt_can_have_table(&pts)) + return 0; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + iommu_pages_list_add(&collect->free_list, pts.table_lower); + ret = pt_descend(&pts, arg, __collect_tables); + if (ret) + return ret; + continue; + } + } + return 0; +} + +static inline struct pt_table_p *table_alloc_top(struct pt_common *common, + uintptr_t top_of_table, + gfp_t gfp) +{ + struct pt_iommu *iommu_table = iommu_from_common(common); + + /* + * Top doesn't need the free list or otherwise, so it technically + * doesn't need to use iommu pages. Use the API anyhow as the top is + * usually not smaller than PAGE_SIZE to keep things simple. + */ + return iommu_alloc_pages_node_sz( + iommu_table->nid, gfp, + log2_to_int(pt_top_memsize_lg2(common, top_of_table))); +} + +static void NS(get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_top_range(common); + struct pt_state pts = pt_init_top(&range); + pt_vaddr_t pgsize_bitmap = 0; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) { + for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL; + pts.level++) { + if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) + break; + pgsize_bitmap |= pt_possible_sizes(&pts); + } + } else { + for (pts.level = 0; pts.level <= range.top_level; pts.level++) + pgsize_bitmap |= pt_possible_sizes(&pts); + } + + /* Hide page sizes larger than the maximum OA */ + info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2); +} + +static void NS(deinit)(struct pt_iommu *iommu_table) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + /* + * The driver has to already have fenced the HW access to the page table + * and invalidated any caching referring to this memory. + */ + iommu_put_pages_list(&collect.free_list); +} + +static const struct pt_iommu_ops NS(ops) = { + .get_info = NS(get_info), + .deinit = NS(deinit), +}; + +static int pt_init_common(struct pt_common *common) +{ + struct pt_range top_range = pt_top_range(common); + + if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL)) + return -EINVAL; + + if (top_range.top_level == PT_MAX_TOP_LEVEL || + common->max_vasz_lg2 == top_range.max_vasz_lg2) + common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP); + + if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2) + common->features |= BIT(PT_FEAT_FULL_VA); + + /* Requested features must match features compiled into this format */ + if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) || + (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) && + (common->features & PT_FORCE_ENABLED_FEATURES) != + PT_FORCE_ENABLED_FEATURES)) + return -EOPNOTSUPP; + + if (common->max_oasz_lg2 == 0) + common->max_oasz_lg2 = pt_max_oa_lg2(common); + else + common->max_oasz_lg2 = min(common->max_oasz_lg2, + pt_max_oa_lg2(common)); + return 0; +} + +static int pt_iommu_init_domain(struct pt_iommu *iommu_table, + struct iommu_domain *domain) +{ + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_iommu_info info; + struct pt_range range; + + NS(get_info)(iommu_table, &info); + + domain->type = __IOMMU_DOMAIN_PAGING; + domain->pgsize_bitmap = info.pgsize_bitmap; + + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + range = _pt_top_range(common, + _pt_top_set(NULL, PT_MAX_TOP_LEVEL)); + else + range = pt_top_range(common); + + /* A 64-bit high address space table on a 32-bit system cannot work. */ + domain->geometry.aperture_start = (unsigned long)range.va; + if ((pt_vaddr_t)domain->geometry.aperture_start != range.va) + return -EOVERFLOW; + + /* + * The aperture is limited to what the API can do after considering all + * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used + * to store a VA. Set the aperture to something that is valid for all + * cases. Saturate instead of truncate the end if the types are smaller + * than the top range. aperture_end should be called aperture_last. + */ + domain->geometry.aperture_end = (unsigned long)range.last_va; + if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) { + domain->geometry.aperture_end = ULONG_MAX; + domain->pgsize_bitmap &= ULONG_MAX; + } + domain->geometry.force_aperture = true; + + return 0; +} + +static void pt_iommu_zero(struct pt_iommu_table *fmt_table) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_iommu cfg = *iommu_table; + + static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0); + memset_after(fmt_table, 0, iommu.domain); + + /* The caller can initialize some of these values */ + iommu_table->nid = cfg.nid; +} + +#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg) +#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init) + +int pt_iommu_init(struct pt_iommu_table *fmt_table, + const struct pt_iommu_table_cfg *cfg, gfp_t gfp) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_table_p *table_mem; + int ret; + + if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 || + !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2) + return -EINVAL; + + pt_iommu_zero(fmt_table); + common->features = cfg->common.features; + common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2; + common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2; + ret = pt_iommu_fmt_init(fmt_table, cfg); + if (ret) + return ret; + + if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common)) + return -EINVAL; + + ret = pt_init_common(common); + if (ret) + return ret; + + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && + (pt_feature(common, PT_FEAT_FULL_VA) || + pt_feature(common, PT_FEAT_DYNAMIC_TOP))) + return -EINVAL; + + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); + if (ret) + return ret; + + table_mem = table_alloc_top(common, common->top_of_table, gfp); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + pt_top_set(common, table_mem, pt_top_get_level(common)); + + /* Must be last, see pt_iommu_deinit() */ + iommu_table->ops = &NS(ops); + return 0; +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU"); + +#ifdef pt_iommu_fmt_hw_info +#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info) +#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info) +void pt_iommu_hw_info(struct pt_iommu_table *fmt_table, + struct pt_iommu_table_hw_info *info) +{ + struct pt_iommu *iommu_table = &fmt_table->iommu; + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range top_range = pt_top_range(common); + + pt_iommu_fmt_hw_info(fmt_table, &top_range, info); +} +EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); +#endif + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); +MODULE_IMPORT_NS("GENERIC_PT"); + +#endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h new file mode 100644 index 000000000000..defa96abc497 --- /dev/null +++ b/include/linux/generic_pt/iommu.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_IOMMU_H +#define __GENERIC_PT_IOMMU_H + +#include +#include +#include + +struct pt_iommu_ops; + +/** + * DOC: IOMMU Radix Page Table + * + * The IOMMU implementation of the Generic Page Table provides an ops struct + * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and + * the generic map/unmap interface. + * + * This interface uses a caller provided locking approach. The caller must have + * a VA range lock concept that prevents concurrent threads from calling ops on + * the same VA. Generally the range lock must be at least as large as a single + * map call. + */ + +/** + * struct pt_iommu - Base structure for IOMMU page tables + * + * The format-specific struct will include this as the first member. + */ +struct pt_iommu { + /** + * @domain: The core IOMMU domain. The driver should use a union to + * overlay this memory with its previously existing domain struct to + * create an alias. + */ + struct iommu_domain domain; + + /** + * @ops: Function pointers to access the API + */ + const struct pt_iommu_ops *ops; + + /** + * @nid: Node ID to use for table memory allocations. The IOMMU driver + * may want to set the NID to the device's NID, if there are multiple + * table walkers. + */ + int nid; +}; + +/** + * struct pt_iommu_info - Details about the IOMMU page table + * + * Returned from pt_iommu_ops->get_info() + */ +struct pt_iommu_info { + /** + * @pgsize_bitmap: A bitmask where each set bit indicates + * a page size that can be natively stored in the page table. + */ + u64 pgsize_bitmap; +}; + +struct pt_iommu_ops { + /** + * @get_info: Return the pt_iommu_info structure + * @iommu_table: Table to query + * + * Return some basic static information about the page table. + */ + void (*get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info); + + /** + * @deinit: Undo a format specific init operation + * @iommu_table: Table to destroy + * + * Release all of the memory. The caller must have already removed the + * table from all HW access and all caches. + */ + void (*deinit)(struct pt_iommu *iommu_table); +}; + +static inline void pt_iommu_deinit(struct pt_iommu *iommu_table) +{ + /* + * It is safe to call pt_iommu_deinit() before an init, or if init + * fails. The ops pointer will only become non-NULL if deinit needs to be + * run. + */ + if (iommu_table->ops) + iommu_table->ops->deinit(iommu_table); +} + +/** + * struct pt_iommu_cfg - Common configuration values for all formats + */ +struct pt_iommu_cfg { + /** + * @features: Features required. Only these features will be turned on. + * The feature list should reflect what the IOMMU HW is capable of. + */ + unsigned int features; + /** + * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will + * imply the top level of the table. + */ + u8 hw_max_vasz_lg2; + /** + * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format + * might select a lower maximum OA. + */ + u8 hw_max_oasz_lg2; +}; + +/* Generate the exported function signatures from iommu_pt.h */ +#define IOMMU_PROTOTYPES(fmt) \ + int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ + const struct pt_iommu_##fmt##_cfg *cfg, \ + gfp_t gfp); \ + void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ + struct pt_iommu_##fmt##_hw_info *info) +#define IOMMU_FORMAT(fmt, member) \ + struct pt_iommu_##fmt { \ + struct pt_iommu iommu; \ + struct pt_##fmt member; \ + }; \ + IOMMU_PROTOTYPES(fmt) + +/* + * The driver should setup its domain struct like + * union { + * struct iommu_domain domain; + * struct pt_iommu_xxx xx; + * }; + * PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, xx.iommu, domain); + * + * Which creates an alias between driver_domain.domain and + * driver_domain.xx.iommu.domain. This is to avoid a mass rename of existing + * driver_domain.domain users. + */ +#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb) \ + static_assert(offsetof(s, pt_iommu_memb.domain) == \ + offsetof(s, domain_memb)) + +#undef IOMMU_PROTOTYPES +#undef IOMMU_FORMAT +#endif -- cgit v1.2.3 From 879ced2bab1ba95e98fac56c9503791183bc7cbb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:02 -0400 Subject: iommupt: Add the AMD IOMMU v1 page table format AMD IOMMU v1 is unique in supporting contiguous pages with a variable size and it can decode the full 64 bit VA space. Unlike other x86 page tables this explicitly does not do sign extension as part of allowing the entire 64 bit VA space to be supported. The general design is quite similar to the x86 PAE format, except with a 6th level and quite different PTE encoding. This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in the existing code as the existing AMDv1 code starts out with a 3 level table and adds levels on the fly if more IOVA is needed. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 65,64 , 62,61 , -1.01 2^13, 70,66 , 67,62 , -8.08 2^14, 73,69 , 71,65 , -9.09 2^15, 78,75 , 75,71 , -5.05 2^16, 89,89 , 86,84 , -2.02 2^17, 128,121 , 124,112 , -10.10 2^18, 175,175 , 170,163 , -4.04 2^19, 264,306 , 261,279 , 6.06 2^20, 444,525 , 438,489 , 10.10 2^21, 60,62 , 58,59 , 1.01 256*2^12, 381,1833 , 367,1795 , 79.79 256*2^21, 375,1623 , 356,1555 , 77.77 256*2^30, 356,1338 , 349,1277 , 72.72 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 76,89 , 71,86 , 17.17 2^13, 79,89 , 75,86 , 12.12 2^14, 78,90 , 74,86 , 13.13 2^15, 82,89 , 74,86 , 13.13 2^16, 79,89 , 74,86 , 13.13 2^17, 81,89 , 77,87 , 11.11 2^18, 90,92 , 87,89 , 2.02 2^19, 91,93 , 88,90 , 2.02 2^20, 96,95 , 91,92 , 1.01 2^21, 72,88 , 68,85 , 20.20 256*2^12, 372,6583 , 364,6251 , 94.94 256*2^21, 398,6032 , 392,5758 , 93.93 256*2^30, 396,5665 , 389,5258 , 92.92 The ~5-17x speedup when working with mutli-PTE map/unmaps is because the AMD implementation rewalks the entire table on every new PTE while this version retains its position. The same speedup will be seen with dirtys as well. The old implementation triggers a compiler optimization that ends up generating a "rep stos" memset for contiguous PTEs. Since AMD can have contiguous PTEs that span 2Kbytes of table this is a huge win compared to a normal movq loop. It is why the unmap side has a fairly flat runtime as the contiguous PTE sides increases. This version makes it explicit with a memset64() call. Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/Makefile | 1 + drivers/iommu/generic_pt/Kconfig | 12 + drivers/iommu/generic_pt/fmt/Makefile | 11 + drivers/iommu/generic_pt/fmt/amdv1.h | 387 +++++++++++++++++++++++++++++ drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 ++ drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 ++ include/linux/generic_pt/common.h | 19 ++ include/linux/generic_pt/iommu.h | 12 + 8 files changed, 478 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/Makefile create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c (limited to 'include/linux') diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 355294fa9033..b17ef9818759 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -3,6 +3,7 @@ obj-y += arm/ iommufd/ obj-$(CONFIG_AMD_IOMMU) += amd/ obj-$(CONFIG_INTEL_IOMMU) += intel/ obj-$(CONFIG_RISCV_IOMMU) += riscv/ +obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index a81dfdd72ca0..cbdad222923b 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -30,4 +30,16 @@ config IOMMU_PT related to struct iommu_domain using GENERIC_PT. It provides a single implementation of the page table operations that can be shared by multiple drivers. + +if IOMMU_PT +config IOMMU_PT_AMDV1 + tristate "IOMMU page table for 64-bit AMD IOMMU v1" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the AMD v1 page table. AMDv1 is the + "host" page table. It supports granular page sizes of almost every + power of 2 and decodes an full 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. +endif endif diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile new file mode 100644 index 000000000000..a4d83b7e0cf6 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 + +define create_format +obj-$(2) += iommu_$(1).o + +endef + +$(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y))) +$(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h new file mode 100644 index 000000000000..7423ed71417d --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * AMD IOMMU v1 page table + * + * This is described in Section "2.2.3 I/O Page Tables for Host Translations" + * of the "AMD I/O Virtualization Technology (IOMMU) Specification" + * + * Note the level numbering here matches the core code, so level 0 is the same + * as mode 1. + * + */ +#ifndef __GENERIC_PT_FMT_AMDV1_H +#define __GENERIC_PT_FMT_AMDV1_H + +#include "defs_amdv1.h" +#include "../pt_defs.h" + +#include +#include +#include +#include +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 5, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* The DTE only has these bits for the top phyiscal address */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* PTE bits */ +enum { + AMDV1PT_FMT_PR = BIT(0), + AMDV1PT_FMT_D = BIT(6), + AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9), + AMDV1PT_FMT_OA = GENMASK_ULL(51, 12), + AMDV1PT_FMT_FC = BIT_ULL(60), + AMDV1PT_FMT_IR = BIT_ULL(61), + AMDV1PT_FMT_IW = BIT_ULL(62), +}; + +/* + * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make + * these defines to avoid it. + */ +#define AMDV1PT_FMT_NL_DEFAULT 0 +#define AMDV1PT_FMT_NL_SIZE 7 + +static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa amdv1pt_table_pa + +/* Returns the oa for the start of the contiguous entry */ +static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + pt_oaddr_t oa; + + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + oa = FIELD_GET(AMDV1PT_FMT_OA, entry); + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) { + unsigned int sz_bits = oaffz(oa); + + oa = oalog2_set_mod(oa, 0, sz_bits); + } else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) != + AMDV1PT_FMT_NL_DEFAULT)) + return 0; + return oalog2_mul(oa, PT_GRANULE_LG2SZ); +} +#define pt_entry_oa amdv1pt_entry_oa + +static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts) +{ + /* + * Table 15: Page Table Level Parameters + * The top most level cannot have translation entries + */ + return pts->level < PT_MAX_TOP_LEVEL; +} +#define pt_can_have_leaf amdv1pt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +amdv1pt_entry_num_contig_lg2(const struct pt_state *pts) +{ + u32 code; + + if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) == + AMDV1PT_FMT_NL_DEFAULT) + return ilog2(1); + + PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) != + AMDV1PT_FMT_NL_SIZE); + + /* + * The contiguous size is encoded in the length of a string of 1's in + * the low bits of the OA. Reverse the equation: + * code = log2_to_int(num_contig_lg2 + item_lg2sz - + * PT_GRANULE_LG2SZ - 1) - 1 + * Which can be expressed as: + * num_contig_lg2 = oalog2_ffz(code) + 1 - + * item_lg2sz - PT_GRANULE_LG2SZ + * + * Assume the bit layout is correct and remove the masking. Reorganize + * the equation to move all the arithmetic before the ffz. + */ + code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 + + pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ); + return ffz_t(u32, code); +} +#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2 + +static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts) +{ + /* + * Top entry covers bits [63:57] only, this is handled through + * max_vasz_lg2. + */ + if (PT_WARN_ON(pts->level == 5)) + return 7; + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 amdv1pt_num_items_lg2 + +static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + + if (!amdv1pt_can_have_leaf(pts)) + return 0; + + /* + * Table 14: Example Page Size Encodings + * Address bits 51:32 can be used to encode page sizes greater than 4 + * Gbytes. Address bits 63:52 are zero-extended. + * + * 512GB Pages are not supported due to a hardware bug. + * Otherwise every power of two size is supported. + */ + return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1), + isz_lg2) & ~SZ_512G; +} +#define pt_possible_sizes amdv1pt_possible_sizes + +static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64) + pts->index; + unsigned int next_level; + u64 entry; + + pts->entry = entry = READ_ONCE(*tablep); + if (!(entry & AMDV1PT_FMT_PR)) + return PT_ENTRY_EMPTY; + + next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry); + if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT || + next_level == AMDV1PT_FMT_NL_SIZE) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw amdv1pt_load_entry_raw + +static inline void +amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (oasz_lg2 == isz_lg2) { + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_DEFAULT); + WRITE_ONCE(*tablep, entry); + } else { + unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, + AMDV1PT_FMT_NL_SIZE) | + FIELD_PREP(AMDV1PT_FMT_OA, + oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ - + 1) - + 1); + + /* See amdv1pt_clear_entries() */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + memset64(tablep, entry, log2_to_int(num_contig_lg2)); + } + } + pts->entry = entry; +} +#define pt_install_leaf_entry amdv1pt_install_leaf_entry + +static inline bool amdv1pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + /* + * IR and IW are ANDed from the table levels along with the PTE. We + * always control permissions from the PTE, so always set IR and IW for + * tables. + */ + entry = AMDV1PT_FMT_PR | + FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) | + FIELD_PREP(AMDV1PT_FMT_OA, + log2_div(table_pa, PT_GRANULE_LG2SZ)) | + AMDV1PT_FMT_IR | AMDV1PT_FMT_IW; + if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table amdv1pt_install_table + +static inline void amdv1pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW); +} +#define pt_attr_from_entry amdv1pt_attr_from_entry + +static inline void amdv1pt_clear_entries(struct pt_state *pts, + unsigned int num_contig_lg2) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 *end = tablep + log2_to_int(num_contig_lg2); + + /* + * gcc generates rep stos for the io-pgtable code, and this difference + * can show in microbenchmarks with larger contiguous page sizes. + * rep is slower for small cases. + */ + if (num_contig_lg2 <= ilog2(32)) { + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, 0); + } else { + memset64(tablep, 0, log2_to_int(num_contig_lg2)); + } +} +#define pt_clear_entries amdv1pt_clear_entries + +static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + if (READ_ONCE(*tablep) & AMDV1PT_FMT_D) + return true; + return false; +} +#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty + +static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts) +{ + unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts); + u64 *tablep = pt_cur_table(pts, u64) + + log2_set_mod(pts->index, 0, num_contig_lg2); + u64 *end = tablep + log2_to_int(num_contig_lg2); + + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D); +} +#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean + +static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | AMDV1PT_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_amdv1 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_amdv1, iommu) + ->amdpt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu; +} + +static inline int amdv1pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE)) + pte |= AMDV1PT_FMT_FC; + if (iommu_prot & IOMMU_READ) + pte |= AMDV1PT_FMT_IR; + if (iommu_prot & IOMMU_WRITE) + pte |= AMDV1PT_FMT_IW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. + */ + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot amdv1pt_iommu_set_prot + +static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, + const struct pt_iommu_amdv1_cfg *cfg) +{ + struct pt_amdv1 *table = &iommu_table->amdpt; + unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2; + + if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL) + return -EINVAL; + + if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) && + cfg->starting_level != PT_MAX_TOP_LEVEL) + max_vasz_lg2 = PT_GRANULE_LG2SZ + + (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) * + (cfg->starting_level + 1); + + table->common.max_vasz_lg2 = + min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2); + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + pt_top_set_level(&table->common, cfg->starting_level); + return 0; +} +#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init + +static inline void +amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, + const struct pt_range *top_range, + struct pt_iommu_amdv1_hw_info *info) +{ + info->host_pt_root = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK); + info->mode = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif diff --git a/drivers/iommu/generic_pt/fmt/defs_amdv1.h b/drivers/iommu/generic_pt/fmt/defs_amdv1.h new file mode 100644 index 000000000000..0b9614ca6d10 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_amdv1.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H +#define __GENERIC_PT_FMT_DEFS_AMDV1_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct amdv1pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs amdv1pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_amdv1.c b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c new file mode 100644 index 000000000000..72a2337d0c55 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_amdv1.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT amdv1 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) | \ + BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) +#define PT_FORCE_ENABLED_FEATURES \ + (BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \ + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE)) + +#include "iommu_template.h" diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index e69a75511313..21e33489cbf2 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -132,4 +132,23 @@ enum pt_features { PT_FEAT_FMT_START, }; +struct pt_amdv1 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted. Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. + */ + PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START, + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. + */ + PT_FEAT_AMDV1_FORCE_COHERENCE, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index defa96abc497..dc731fe003d1 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -145,6 +145,18 @@ struct pt_iommu_cfg { static_assert(offsetof(s, pt_iommu_memb.domain) == \ offsetof(s, domain_memb)) +struct pt_iommu_amdv1_cfg { + struct pt_iommu_cfg common; + unsigned int starting_level; +}; + +struct pt_iommu_amdv1_hw_info { + u64 host_pt_root; + u8 mode; +}; + +IOMMU_FORMAT(amdv1, amdpt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From 9d4c274cd7d5e1b6b9e116e155f16bcd208237d8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:03 -0400 Subject: iommupt: Add iova_to_phys op iova_to_phys is a performance path for the DMA API and iommufd, implement it using an unrolled get_user_pages() like function waterfall scheme. The implementation itself is fairly trivial. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 105 ++++++++++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 19 +++++-- 2 files changed, 119 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 564f2d3a6e11..5ff1b887928a 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -17,6 +17,111 @@ #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) +static int make_range_ul(struct pt_common *common, struct pt_range *range, + unsigned long iova, unsigned long len) +{ + unsigned long last; + + if (unlikely(len == 0)) + return -EINVAL; + + if (check_add_overflow(iova, len - 1, &last)) + return -EOVERFLOW; + + *range = pt_make_range(common, iova, last); + if (sizeof(iova) > sizeof(range->va)) { + if (unlikely(range->va != iova || range->last_va != last)) + return -EOVERFLOW; + } + return 0; +} + +static __maybe_unused int make_range_u64(struct pt_common *common, + struct pt_range *range, u64 iova, + u64 len) +{ + if (unlikely(iova > ULONG_MAX || len > ULONG_MAX)) + return -EOVERFLOW; + return make_range_ul(common, range, iova, len); +} + +/* + * Some APIs use unsigned long, while othersuse dma_addr_t as the type. Dispatch + * to the correct validation based on the type. + */ +#define make_range_no_check(common, range, iova, len) \ + ({ \ + int ret; \ + if (sizeof(iova) > sizeof(unsigned long) || \ + sizeof(len) > sizeof(unsigned long)) \ + ret = make_range_u64(common, range, iova, len); \ + else \ + ret = make_range_ul(common, range, iova, len); \ + ret; \ + }) + +#define make_range(common, range, iova, len) \ + ({ \ + int ret = make_range_no_check(common, range, iova, len); \ + if (!ret) \ + ret = pt_check_range(range); \ + ret; \ + }) + +static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + pt_oaddr_t *res = arg; + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, descend_fn); + case PT_ENTRY_OA: + *res = pt_entry_oa_exact(&pts); + return 0; + } + return -ENOENT; +} +PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys); + +/** + * iova_to_phys() - Return the output address for the given IOVA + * @iommu_table: Table to query + * @iova: IO virtual address to query + * + * Determine the output address from the given IOVA. @iova may have any + * alignment, the returned physical will be adjusted with any sub page offset. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Return: 0 if there is no translation for the given iova. + */ +phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_range range; + pt_oaddr_t res; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + ret = pt_walk_range(&range, __iova_to_phys, &res); + /* PHYS_ADDR_MAX would be a better error code */ + if (ret) + return 0; + return res; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); + struct pt_iommu_collect_args { struct iommu_pages_list free_list; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index dc731fe003d1..5622856e1998 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -116,11 +116,13 @@ struct pt_iommu_cfg { }; /* Generate the exported function signatures from iommu_pt.h */ -#define IOMMU_PROTOTYPES(fmt) \ - int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ - const struct pt_iommu_##fmt##_cfg *cfg, \ - gfp_t gfp); \ - void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ +#define IOMMU_PROTOTYPES(fmt) \ + phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ + dma_addr_t iova); \ + int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ + const struct pt_iommu_##fmt##_cfg *cfg, \ + gfp_t gfp); \ + void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ struct pt_iommu_##fmt##_hw_info *info) #define IOMMU_FORMAT(fmt, member) \ struct pt_iommu_##fmt { \ @@ -129,6 +131,13 @@ struct pt_iommu_cfg { }; \ IOMMU_PROTOTYPES(fmt) +/* + * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the + * iommu_pt + */ +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, + /* * The driver should setup its domain struct like * union { -- cgit v1.2.3 From 7c53f4238aa8bfb476e177263133ead2eeb8d55d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:04 -0400 Subject: iommupt: Add unmap_pages op unmap_pages removes mappings and any fully contained interior tables from the given range. This follows the now-standard iommu_domain API definition where it does not split up larger page sizes into smaller. The caller must perform unmap only on ranges created by map or it must have somehow otherwise determined safe cut points (eg iommufd/vfio use iova_to_phys to scan for them) A future work will provide 'cut' which explicitly does the page size split if the HW can support it. unmap is implemented with a recursive descent of the tree. If the caller provides a VA range that spans an entire table item then the table memory can be freed as well. If an entire table item can be freed then this version will also check the leaf-only level of the tree to ensure that all entries are present to generate -EINVAL. Many of the existing drivers don't do this extra check. This version sits under the iommu_domain_ops as unmap_pages() but does not require the external page size calculation. The implementation is actually unmap_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangment of page sizes. A future series can optimize __iommu_unmap() to take advantage of this. Freed page table memory is batched up in the gather and will be freed in the driver's iotlb_sync() callback after the IOTLB flush completes. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 156 ++++++++++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 10 ++- 2 files changed, 164 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 5ff1b887928a..e3d1b272723d 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -14,6 +14,29 @@ #include #include #include "../iommu-pages.h" +#include +#include + +static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, + struct pt_iommu *iommu_table, pt_vaddr_t iova, + pt_vaddr_t len, + struct iommu_pages_list *free_list) +{ + struct pt_common *common = common_from_iommu(iommu_table); + + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); + /* + * Note that the sync frees the gather's free list, so we must + * not have any pages on that list that are covered by iova/len + */ + } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); + } + + iommu_pages_list_splice(free_list, &iotlb_gather->freelist); +} #define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op) @@ -164,6 +187,139 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common, log2_to_int(pt_top_memsize_lg2(common, top_of_table))); } +struct pt_unmap_args { + struct iommu_pages_list free_list; + pt_vaddr_t unmapped; +}; + +static __maybe_unused int __unmap_range(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_unmap_args *unmap = arg; + unsigned int num_oas = 0; + unsigned int start_index; + int ret = 0; + + _pt_iter_first(&pts); + start_index = pts.index; + pts.type = pt_load_entry_raw(&pts); + /* + * A starting index is in the middle of a contiguous entry + * + * The IOMMU API does not require drivers to support unmapping parts of + * large pages. Long ago VFIO would try to split maps but the current + * version never does. + * + * Instead when unmap reaches a partial unmap of the start of a large + * IOPTE it should remove the entire IOPTE and return that size to the + * caller. + */ + if (pts.type == PT_ENTRY_OA) { + if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts))) + return -EINVAL; + /* Micro optimization */ + goto start_oa; + } + + do { + if (pts.type != PT_ENTRY_OA) { + bool fully_covered; + + if (pts.type != PT_ENTRY_TABLE) { + ret = -EINVAL; + break; + } + + if (pts.index != start_index) + pt_index_to_va(&pts); + pts.table_lower = pt_table_ptr(&pts); + + fully_covered = pt_entry_fully_covered( + &pts, pt_table_item_lg2sz(&pts)); + + ret = pt_descend(&pts, arg, __unmap_range); + if (ret) + break; + + /* + * If the unmapping range fully covers the table then we + * can free it as well. The clear is delayed until we + * succeed in clearing the lower table levels. + */ + if (fully_covered) { + iommu_pages_list_add(&unmap->free_list, + pts.table_lower); + pt_clear_entries(&pts, ilog2(1)); + } + pts.index++; + } else { + unsigned int num_contig_lg2; +start_oa: + /* + * If the caller requested an last that falls within a + * single entry then the entire entry is unmapped and + * the length returned will be larger than requested. + */ + num_contig_lg2 = pt_entry_num_contig_lg2(&pts); + pt_clear_entries(&pts, num_contig_lg2); + num_oas += log2_to_int(num_contig_lg2); + pts.index += log2_to_int(num_contig_lg2); + } + if (pts.index >= pts.end_index) + break; + pts.type = pt_load_entry_raw(&pts); + } while (true); + + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts)); + return ret; +} + +/** + * unmap_pages() - Make a range of IOVA empty/not present + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_pages() will remove a translation created by map_pages(). It cannot + * subdivide a mapping created by map_pages(), so it should be called with IOVA + * ranges that match those passed to map_pages(). The IOVA range can aggregate + * contiguous map_pages() calls so long as no individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the point + * unmapping stopped. + */ +size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( + unmap.free_list) }; + pt_vaddr_t len = pgsize * pgcount; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); + if (ret) + return 0; + + pt_walk_range(&range, __unmap_range, &unmap); + + gather_range_pages(iotlb_gather, iommu_table, iova, len, + &unmap.free_list); + + return unmap.unmapped; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); + static void NS(get_info)(struct pt_iommu *iommu_table, struct pt_iommu_info *info) { diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 5622856e1998..ceb6bc9cea37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -9,6 +9,7 @@ #include #include +struct iommu_iotlb_gather; struct pt_iommu_ops; /** @@ -119,6 +120,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + size_t pt_iommu_##fmt##_unmap_pages( \ + struct iommu_domain *domain, unsigned long iova, \ + size_t pgsize, size_t pgcount, \ + struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -135,8 +140,9 @@ struct pt_iommu_cfg { * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the * iommu_pt */ -#define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .unmap_pages = &pt_iommu_##fmt##_unmap_pages /* * The driver should setup its domain struct like -- cgit v1.2.3 From dcd6a011a8d523a114af2360a8753de5bd60c139 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:05 -0400 Subject: iommupt: Add map_pages op map is slightly complicated because it has to handle a number of special edge cases: - Overmapping a previously shared, but now empty, table level with an OA. Requries validating and freeing the possibly empty tables - Doing the above across an entire to-be-created contiguous entry - Installing a new shared table level concurrently with another thread - Expanding the table by adding more top levels Table expansion is a unique feature of AMDv1, this version is quite similar except we handle racing concurrent lockless map. The table top pointer and starting level are encoded in a single uintptr_t which ensures we can READ_ONCE() without tearing. Any op will do the READ_ONCE() and use that fixed point as its starting point. Concurrent expansion is handled with a table global spinlock. When inserting a new table entry map checks that the entire portion of the table is empty. This includes freeing any empty lower tables that will be overwritten by an OA. A separate free list is used while checking and collecting all the empty lower tables so that writing the new entry is uninterrupted, either the new entry fully writes or nothing changes. A special fast path for PAGE_SIZE is implemented that does a direct walk to the leaf level and installs a single entry. This gives ~15% improvement for iommu_map() when mapping lists of single pages. This version sits under the iommu_domain_ops as map_pages() but does not require the external page size calculation. The implementation is actually map_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangment of page sizes. A future series can optimize iommu_map() to take advantage of this. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 501 +++++++++++++++++++++++++++++++++++- drivers/iommu/generic_pt/pt_iter.h | 2 +- include/linux/generic_pt/iommu.h | 59 +++++ 3 files changed, 560 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index e3d1b272723d..f32e81509f4f 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -91,6 +91,23 @@ static __maybe_unused int make_range_u64(struct pt_common *common, ret; \ }) +static inline unsigned int compute_best_pgsize(struct pt_state *pts, + pt_oaddr_t oa) +{ + struct pt_iommu *iommu_table = iommu_from_common(pts->range->common); + + if (!pt_can_have_leaf(pts)) + return 0; + + /* + * The page size is limited by the domain's bitmap. This allows the core + * code to reduce the supported page sizes by changing the bitmap. + */ + return pt_compute_best_pgsize(pt_possible_sizes(pts) & + iommu_table->domain.pgsize_bitmap, + pts->range->va, pts->range->last_va, oa); +} + static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg, unsigned int level, struct pt_table_p *table, @@ -147,6 +164,8 @@ EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); struct pt_iommu_collect_args { struct iommu_pages_list free_list; + /* Fail if any OAs are within the range */ + u8 check_mapped : 1; }; static int __collect_tables(struct pt_range *range, void *arg, @@ -156,7 +175,7 @@ static int __collect_tables(struct pt_range *range, void *arg, struct pt_iommu_collect_args *collect = arg; int ret; - if (!pt_can_have_table(&pts)) + if (!collect->check_mapped && !pt_can_have_table(&pts)) return 0; for_each_pt_level_entry(&pts) { @@ -167,6 +186,8 @@ static int __collect_tables(struct pt_range *range, void *arg, return ret; continue; } + if (pts.type == PT_ENTRY_OA && collect->check_mapped) + return -EADDRINUSE; } return 0; } @@ -187,6 +208,477 @@ static inline struct pt_table_p *table_alloc_top(struct pt_common *common, log2_to_int(pt_top_memsize_lg2(common, top_of_table))); } +/* Allocate an interior table */ +static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts, + gfp_t gfp) +{ + struct pt_iommu *iommu_table = + iommu_from_common(parent_pts->range->common); + struct pt_state child_pts = + pt_init(parent_pts->range, parent_pts->level - 1, NULL); + + return iommu_alloc_pages_node_sz( + iommu_table->nid, gfp, + log2_to_int(pt_num_items_lg2(&child_pts) + + ilog2(PT_ITEM_WORD_SIZE))); +} + +static inline int pt_iommu_new_table(struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + struct pt_table_p *table_mem; + phys_addr_t phys; + + /* Given PA/VA/length can't be represented */ + if (PT_WARN_ON(!pt_can_have_table(pts))) + return -ENXIO; + + table_mem = table_alloc(pts, attrs->gfp); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + + phys = virt_to_phys(table_mem); + if (!pt_install_table(pts, phys, attrs)) { + iommu_free_pages(table_mem); + return -EAGAIN; + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + /* + * The underlying table can't store the physical table address. + * This happens when kunit testing tables outside their normal + * environment where a CPU might be limited. + */ + pt_load_single_entry(pts); + if (PT_WARN_ON(pt_table_pa(pts) != phys)) { + pt_clear_entries(pts, ilog2(1)); + iommu_free_pages(table_mem); + return -EINVAL; + } + } + + pts->table_lower = table_mem; + return 0; +} + +struct pt_iommu_map_args { + struct iommu_iotlb_gather *iotlb_gather; + struct pt_write_attrs attrs; + pt_oaddr_t oa; + unsigned int leaf_pgsize_lg2; + unsigned int leaf_level; +}; + +/* + * This will recursively check any tables in the block to validate they are + * empty and then free them through the gather. + */ +static int clear_contig(const struct pt_state *start_pts, + struct iommu_iotlb_gather *iotlb_gather, + unsigned int step, unsigned int pgsize_lg2) +{ + struct pt_iommu *iommu_table = + iommu_from_common(start_pts->range->common); + struct pt_range range = *start_pts->range; + struct pt_state pts = + pt_init(&range, start_pts->level, start_pts->table); + struct pt_iommu_collect_args collect = { .check_mapped = true }; + int ret; + + pts.index = start_pts->index; + pts.end_index = start_pts->index + step; + for (; _pt_iter_load(&pts); pt_next_entry(&pts)) { + if (pts.type == PT_ENTRY_TABLE) { + collect.free_list = + IOMMU_PAGES_LIST_INIT(collect.free_list); + ret = pt_walk_descend_all(&pts, __collect_tables, + &collect); + if (ret) + return ret; + + /* + * The table item must be cleared before we can update + * the gather + */ + pt_clear_entries(&pts, ilog2(1)); + + iommu_pages_list_add(&collect.free_list, + pt_table_ptr(&pts)); + gather_range_pages( + iotlb_gather, iommu_table, range.va, + log2_to_int(pt_table_item_lg2sz(&pts)), + &collect.free_list); + } else if (pts.type != PT_ENTRY_EMPTY) { + return -EADDRINUSE; + } + } + return 0; +} + +static int __map_range_leaf(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int start_index; + pt_oaddr_t oa = map->oa; + unsigned int step; + bool need_contig; + int ret = 0; + + PT_WARN_ON(map->leaf_level != level); + PT_WARN_ON(!pt_can_have_leaf(&pts)); + + step = log2_to_int_t(unsigned int, + leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); + need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + + _pt_iter_first(&pts); + start_index = pts.index; + do { + pts.type = pt_load_entry_raw(&pts); + if (pts.type != PT_ENTRY_EMPTY || need_contig) { + if (pts.index != start_index) + pt_index_to_va(&pts); + ret = clear_contig(&pts, map->iotlb_gather, step, + leaf_pgsize_lg2); + if (ret) + break; + } + + if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) { + pt_index_to_va(&pts); + PT_WARN_ON(compute_best_pgsize(&pts, oa) != + leaf_pgsize_lg2); + } + pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs); + + oa += log2_to_int(leaf_pgsize_lg2); + pts.index += step; + } while (pts.index < pts.end_index); + + map->oa = oa; + return ret; +} + +static int __map_range(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + int ret; + + PT_WARN_ON(map->leaf_level == level); + PT_WARN_ON(!pt_can_have_table(&pts)); + + _pt_iter_first(&pts); + + /* Descend to a child table */ + do { + pts.type = pt_load_entry_raw(&pts); + + if (pts.type != PT_ENTRY_TABLE) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + ret = pt_iommu_new_table(&pts, &map->attrs); + if (ret) { + /* + * Racing with another thread installing a table + */ + if (ret == -EAGAIN) + continue; + return ret; + } + } else { + pts.table_lower = pt_table_ptr(&pts); + } + + /* + * The already present table can possibly be shared with another + * concurrent map. + */ + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + if (ret) + return ret; + + pts.index++; + pt_index_to_va(&pts); + if (pts.index >= pts.end_index) + break; + } while (true); + return 0; +} + +/* + * Fast path for the easy case of mapping a 4k page to an already allocated + * table. This is a common workload. If it returns EAGAIN run the full algorithm + * instead. + */ +static __always_inline int __do_map_single_page(struct pt_range *range, + void *arg, unsigned int level, + struct pt_table_p *table, + pt_level_fn_t descend_fn) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *map = arg; + + pts.type = pt_load_single_entry(&pts); + if (level == 0) { + if (pts.type != PT_ENTRY_EMPTY) + return -EADDRINUSE; + pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT, + &map->attrs); + map->oa += PAGE_SIZE; + return 0; + } + if (pts.type == PT_ENTRY_TABLE) + return pt_descend(&pts, arg, descend_fn); + /* Something else, use the slow path */ + return -EAGAIN; +} +PT_MAKE_LEVELS(__map_single_page, __do_map_single_page); + +/* + * Add a table to the top, increasing the top level as much as necessary to + * encompass range. + */ +static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list); + struct pt_common *common = common_from_iommu(iommu_table); + uintptr_t top_of_table = READ_ONCE(common->top_of_table); + uintptr_t new_top_of_table = top_of_table; + struct pt_table_p *table_mem; + unsigned int new_level; + spinlock_t *domain_lock; + unsigned long flags; + int ret; + + while (true) { + struct pt_range top_range = + _pt_top_range(common, new_top_of_table); + struct pt_state pts = pt_init_top(&top_range); + + top_range.va = range->va; + top_range.last_va = range->last_va; + + if (!pt_check_range(&top_range) && map->leaf_level <= pts.level) + break; + + pts.level++; + if (pts.level > PT_MAX_TOP_LEVEL || + pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) { + ret = -ERANGE; + goto err_free; + } + + new_level = pts.level; + table_mem = table_alloc_top( + common, _pt_top_set(NULL, pts.level), map->attrs.gfp); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + iommu_pages_list_add(&free_list, table_mem); + + /* The new table links to the lower table always at index 0 */ + top_range.va = 0; + top_range.top_level = new_level; + pts.table_lower = pts.table; + pts.table = table_mem; + pt_load_single_entry(&pts); + PT_WARN_ON(pts.index != 0); + pt_install_table(&pts, virt_to_phys(pts.table_lower), + &map->attrs); + new_top_of_table = _pt_top_set(pts.table, pts.level); + } + + /* + * top_of_table is write locked by the spinlock, but readers can use + * READ_ONCE() to get the value. Since we encode both the level and the + * pointer in one quanta the lockless reader will always see something + * valid. The HW must be updated to the new level under the spinlock + * before top_of_table is updated so that concurrent readers don't map + * into the new level until it is fully functional. If another thread + * already updated it while we were working then throw everything away + * and try again. + */ + domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table); + spin_lock_irqsave(domain_lock, flags); + if (common->top_of_table != top_of_table) { + spin_unlock_irqrestore(domain_lock, flags); + ret = -EAGAIN; + goto err_free; + } + + /* + * We do not issue any flushes for change_top on the expectation that + * any walk cache will not become a problem by adding another layer to + * the tree. Misses will rewalk from the updated top pointer, hits + * continue to be correct. Negative caching is fine too since all the + * new IOVA added by the new top is non-present. + */ + iommu_table->driver_ops->change_top( + iommu_table, virt_to_phys(table_mem), new_level); + WRITE_ONCE(common->top_of_table, new_top_of_table); + spin_unlock_irqrestore(domain_lock, flags); + return 0; + +err_free: + iommu_put_pages_list(&free_list); + return ret; +} + +static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, + struct pt_iommu_map_args *map) +{ + struct pt_common *common = common_from_iommu(iommu_table); + int ret; + + do { + ret = pt_check_range(range); + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + return ret; + + if (!ret && map->leaf_level <= range->top_level) + break; + + ret = increase_top(iommu_table, range, map); + if (ret && ret != -EAGAIN) + return ret; + + /* Reload the new top */ + *range = pt_make_range(common, range->va, range->last_va); + } while (ret); + PT_WARN_ON(pt_check_range(range)); + return 0; +} + +static int do_map(struct pt_range *range, bool single_page, + struct pt_iommu_map_args *map) +{ + if (single_page) { + int ret; + + ret = pt_walk_range(range, __map_single_page, map); + if (ret != -EAGAIN) + return ret; + /* EAGAIN falls through to the full path */ + } + + if (map->leaf_level == range->top_level) + return pt_walk_range(range, __map_range_leaf, map); + return pt_walk_range(range, __map_range, map); +} + +/** + * map_pages() - Install translation for an IOVA range + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @pgsize: Length of each page + * @pgcount: Length of the range in pgsize units starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * @mapped: Total bytes successfully mapped + * + * The range starting at IOVA will have paddr installed into it. The caller + * must specify a valid pgsize and pgcount to segment the range into compatible + * blocks. + * + * On error the caller will probably want to invoke unmap on the range from iova + * up to the amount indicated by @mapped to return the table back to an + * unchanged state. + * + * Context: The caller must hold a write range lock that includes the whole + * range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were + * mapped are added to @mapped, @mapped is not zerod first. + */ +int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; + struct pt_common *common = common_from_iommu(iommu_table); + struct iommu_iotlb_gather iotlb_gather; + pt_vaddr_t len = pgsize * pgcount; + struct pt_iommu_map_args map = { + .iotlb_gather = &iotlb_gather, + .oa = paddr, + .leaf_pgsize_lg2 = vaffs(pgsize), + }; + bool single_page = false; + struct pt_range range; + int ret; + + iommu_iotlb_gather_init(&iotlb_gather); + + if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE)))) + return -EINVAL; + + /* Check the paddr doesn't exceed what the table can store */ + if ((sizeof(pt_oaddr_t) < sizeof(paddr) && + (pt_vaddr_t)paddr > PT_VADDR_MAX) || + (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 && + oalog2_div(paddr, common->max_oasz_lg2))) + return -ERANGE; + + ret = pt_iommu_set_prot(common, &map.attrs, prot); + if (ret) + return ret; + map.attrs.gfp = gfp; + + ret = make_range_no_check(common, &range, iova, len); + if (ret) + return ret; + + /* Calculate target page size and level for the leaves */ + if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && + pgcount == 1) { + PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (log2_mod(iova | paddr, PAGE_SHIFT)) + return -ENXIO; + map.leaf_pgsize_lg2 = PAGE_SHIFT; + map.leaf_level = 0; + single_page = true; + } else { + map.leaf_pgsize_lg2 = pt_compute_best_pgsize( + pgsize_bitmap, range.va, range.last_va, paddr); + if (!map.leaf_pgsize_lg2) + return -ENXIO; + map.leaf_level = + pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + } + + ret = check_map_range(iommu_table, &range, &map); + if (ret) + return ret; + + PT_WARN_ON(map.leaf_level > range.top_level); + + ret = do_map(&range, single_page, &map); + + /* + * Table levels were freed and replaced with large items, flush any walk + * cache that may refer to the freed levels. + */ + if (!iommu_pages_list_empty(&iotlb_gather.freelist)) + iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather); + + /* Bytes successfully mapped */ + PT_WARN_ON(!ret && map.oa - paddr != len); + *mapped += map.oa - paddr; + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); + struct pt_unmap_args { struct iommu_pages_list free_list; pt_vaddr_t unmapped; @@ -445,6 +937,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table) memset_after(fmt_table, 0, iommu.domain); /* The caller can initialize some of these values */ + iommu_table->driver_ops = cfg.driver_ops; iommu_table->nid = cfg.nid; } @@ -478,6 +971,12 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table, if (ret) return ret; + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && + WARN_ON(!iommu_table->driver_ops || + !iommu_table->driver_ops->change_top || + !iommu_table->driver_ops->get_top_lock)) + return -EINVAL; + if (pt_feature(common, PT_FEAT_SIGN_EXTEND) && (pt_feature(common, PT_FEAT_FULL_VA) || pt_feature(common, PT_FEAT_DYNAMIC_TOP))) diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h index 87f4a26c1a41..c0d8617cce29 100644 --- a/drivers/iommu/generic_pt/pt_iter.h +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -612,7 +612,7 @@ static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, * This builds a function call tree that can be fully inlined. * The caller must provide a function body in an __always_inline function:: * - * static __always_inline int do(struct pt_range *range, void *arg, + * static __always_inline int do_fn(struct pt_range *range, void *arg, * unsigned int level, struct pt_table_p *table, * pt_level_fn_t descend_fn) * diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index ceb6bc9cea37..0d59423024d5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -11,6 +11,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; +struct pt_iommu_driver_ops; /** * DOC: IOMMU Radix Page Table @@ -43,6 +44,12 @@ struct pt_iommu { */ const struct pt_iommu_ops *ops; + /** + * @driver_ops: Function pointers provided by the HW driver to help + * manage HW details like caches. + */ + const struct pt_iommu_driver_ops *driver_ops; + /** * @nid: Node ID to use for table memory allocations. The IOMMU driver * may want to set the NID to the device's NID, if there are multiple @@ -84,6 +91,53 @@ struct pt_iommu_ops { void (*deinit)(struct pt_iommu *iommu_table); }; +/** + * struct pt_iommu_driver_ops - HW IOTLB cache flushing operations + * + * The IOMMU driver should implement these using container_of(iommu_table) to + * get to it's iommu_domain derived structure. All ops can be called in atomic + * contexts as they are buried under DMA API calls. + */ +struct pt_iommu_driver_ops { + /** + * @change_top: Update the top of table pointer + * @iommu_table: Table to operate on + * @top_paddr: New CPU physical address of the top pointer + * @top_level: IOMMU PT level of the new top + * + * Called under the get_top_lock() spinlock. The driver must update all + * HW references to this domain with a new top address and + * configuration. On return mappings placed in the new top must be + * reachable by the HW. + * + * top_level encodes the level in IOMMU PT format, level 0 is the + * smallest page size increasing from there. This has to be translated + * to any HW specific format. During this call the new top will not be + * visible to any other API. + * + * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if + * enabled. + */ + void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr, + unsigned int top_level); + + /** + * @get_top_lock: lock to hold when changing the table top + * @iommu_table: Table to operate on + * + * Return a lock to hold when changing the table top page table from + * being stored in HW. The lock will be held prior to calling + * change_top() and released once the top is fully visible. + * + * Typically this would be a lock that protects the iommu_domain's + * attachment list. + * + * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if + * enabled. + */ + spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table); +}; + static inline void pt_iommu_deinit(struct pt_iommu *iommu_table) { /* @@ -120,6 +174,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ + unsigned long iova, phys_addr_t paddr, \ + size_t pgsize, size_t pgcount, \ + int prot, gfp_t gfp, size_t *mapped); \ size_t pt_iommu_##fmt##_unmap_pages( \ struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ @@ -142,6 +200,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .map_pages = &pt_iommu_##fmt##_map_pages, \ .unmap_pages = &pt_iommu_##fmt##_unmap_pages /* -- cgit v1.2.3 From 4a00f943489103b4b9edff9f39bd484efbfb15fa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:06 -0400 Subject: iommupt: Add read_and_clear_dirty op IOMMU HW now supports updating a dirty bit in an entry when a DMA writes to the entry's VA range. iommufd has a uAPI to read and clear the dirty bits from the tables. This is a trivial recursive descent algorithm to read and optionally clear the dirty bits. The format needs a function to tell if a contiguous entry is dirty, and a function to clear a contiguous entry back to clean. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 104 ++++++++++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 6 +++ 2 files changed, 110 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index f32e81509f4f..448c5796d4a8 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -162,6 +162,108 @@ phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain, } EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU"); +struct pt_iommu_dirty_args { + struct iommu_dirty_bitmap *dirty; + unsigned int flags; +}; + +static void record_dirty(struct pt_state *pts, + struct pt_iommu_dirty_args *dirty, + unsigned int num_contig_lg2) +{ + pt_vaddr_t dirty_len; + + if (num_contig_lg2 != ilog2(1)) { + unsigned int index = pts->index; + unsigned int end_index = log2_set_mod_max_t( + unsigned int, pts->index, num_contig_lg2); + + /* Adjust for being contained inside a contiguous page */ + end_index = min(end_index, pts->end_index); + dirty_len = (end_index - index) * + log2_to_int(pt_table_item_lg2sz(pts)); + } else { + dirty_len = log2_to_int(pt_table_item_lg2sz(pts)); + } + + if (dirty->dirty->bitmap) + iova_bitmap_set(dirty->dirty->bitmap, pts->range->va, + dirty_len); + + if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) { + pt_entry_make_write_clean(pts); + iommu_iotlb_gather_add_range(dirty->dirty->gather, + pts->range->va, dirty_len); + } +} + +static inline int __read_and_clear_dirty(struct pt_range *range, void *arg, + unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_dirty_args *dirty = arg; + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + ret = pt_descend(&pts, arg, __read_and_clear_dirty); + if (ret) + return ret; + continue; + } + if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts)) + record_dirty(&pts, dirty, + pt_entry_num_contig_lg2(&pts)); + } + return 0; +} + +/** + * read_and_clear_dirty() - Manipulate the HW set write dirty state + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the IOVA + * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR + * @dirty: Place to store the dirty bits + * + * Iterate over all the entries in the mapped range and record their write dirty + * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is not specified then + * the entries will be left dirty, otherwise they are returned to being not + * write dirty. + * + * Context: The caller must hold a read range lock that includes @iova. + * + * Returns: -ERRNO on failure, 0 on success. + */ +int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_iommu_dirty_args dirty_args = { + .dirty = dirty, + .flags = flags, + }; + struct pt_range range; + int ret; + +#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty) + return -EOPNOTSUPP; +#endif + + ret = make_range(common_from_iommu(iommu_table), &range, iova, size); + if (ret) + return ret; + + ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args); + PT_WARN_ON(ret); + return ret; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); + struct pt_iommu_collect_args { struct iommu_pages_list free_list; /* Fail if any OAs are within the range */ @@ -1015,5 +1117,7 @@ EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW)); MODULE_IMPORT_NS("GENERIC_PT"); +/* For iommu_dirty_bitmap_record() */ +MODULE_IMPORT_NS("IOMMUFD"); #endif /* __GENERIC_PT_IOMMU_PT_H */ diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0d59423024d5..03a906fbe12a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -12,6 +12,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; struct pt_iommu_driver_ops; +struct iommu_dirty_bitmap; /** * DOC: IOMMU Radix Page Table @@ -182,6 +183,9 @@ struct pt_iommu_cfg { struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ struct iommu_iotlb_gather *iotlb_gather); \ + int pt_iommu_##fmt##_read_and_clear_dirty( \ + struct iommu_domain *domain, unsigned long iova, size_t size, \ + unsigned long flags, struct iommu_dirty_bitmap *dirty); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -202,6 +206,8 @@ struct pt_iommu_cfg { .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ .map_pages = &pt_iommu_##fmt##_map_pages, \ .unmap_pages = &pt_iommu_##fmt##_unmap_pages +#define IOMMU_PT_DIRTY_OPS(fmt) \ + .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty /* * The driver should setup its domain struct like -- cgit v1.2.3 From e5359dcc617a2174d834bab4083340196615d8bd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:08 -0400 Subject: iommupt: Add a mock pagetable format for iommufd selftest to use The iommufd self test uses an xarray to store the pfns and their orders to emulate a page table. Slightly modify the amdv1 page table to create a real page table that has similar properties: - 2k base granule to simulate something like a 4k page table on a 64K PAGE_SIZE ARM system - Contiguous page support for every PFN order - Dirty tracking AMDv1 is the closest format, as it is the only one that already supports every page size. Tweak it to have only 5 levels and an 11 bit base granule and compile it separately as a format variant. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/fmt/Makefile | 1 + drivers/iommu/generic_pt/fmt/amdv1.h | 18 ++++++++++++++++-- drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 ++++++++++ include/linux/generic_pt/iommu.h | 6 ++++++ 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 32f3956c7509..f0c22cf5f7be 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 +iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock IOMMU_PT_KUNIT_TEST := define create_format diff --git a/drivers/iommu/generic_pt/fmt/amdv1.h b/drivers/iommu/generic_pt/fmt/amdv1.h index aaf76bfd21da..aa8e1a8ec95f 100644 --- a/drivers/iommu/generic_pt/fmt/amdv1.h +++ b/drivers/iommu/generic_pt/fmt/amdv1.h @@ -26,11 +26,23 @@ #include enum { - PT_MAX_OUTPUT_ADDRESS_LG2 = 52, - PT_MAX_VA_ADDRESS_LG2 = 64, PT_ITEM_WORD_SIZE = sizeof(u64), + /* + * The IOMMUFD selftest uses the AMDv1 format with some alterations It + * uses a 2k page size to test cases where the CPU page size is not the + * same. + */ +#ifdef AMDV1_IOMMUFD_SELFTEST + PT_MAX_VA_ADDRESS_LG2 = 56, + PT_MAX_OUTPUT_ADDRESS_LG2 = 51, + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 11, +#else + PT_MAX_VA_ADDRESS_LG2 = 64, + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, PT_MAX_TOP_LEVEL = 5, PT_GRANULE_LG2SZ = 12, +#endif PT_TABLEMEM_LG2SZ = 12, /* The DTE only has these bits for the top phyiscal address */ @@ -374,6 +386,7 @@ static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table, } #define pt_iommu_fmt_init amdv1pt_iommu_fmt_init +#ifndef PT_FMT_VARIANT static inline void amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, const struct pt_range *top_range, @@ -384,6 +397,7 @@ amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table, info->mode = top_range->top_level + 1; } #define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info +#endif #if defined(GENERIC_PT_KUNIT) static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = { diff --git a/drivers/iommu/generic_pt/fmt/iommu_mock.c b/drivers/iommu/generic_pt/fmt/iommu_mock.c new file mode 100644 index 000000000000..74e597cba9d9 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_mock.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define AMDV1_IOMMUFD_SELFTEST 1 +#define PT_FMT amdv1 +#define PT_FMT_VARIANT mock +#define PT_SUPPORTED_FEATURES 0 + +#include "iommu_template.h" diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 03a906fbe12a..848a5fb76272 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -237,6 +237,12 @@ struct pt_iommu_amdv1_hw_info { IOMMU_FORMAT(amdv1, amdpt); +/* amdv1_mock is used by the iommufd selftest */ +#define pt_iommu_amdv1_mock pt_iommu_amdv1 +#define pt_iommu_amdv1_mock_cfg pt_iommu_amdv1_cfg +struct pt_iommu_amdv1_mock_hw_info; +IOMMU_PROTOTYPES(amdv1_mock); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From e93d5945ed5bb086431e83eed7ab98b6c058cc0b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:09 -0400 Subject: iommufd: Change the selftest to use iommupt instead of xarray The iommufd self test uses an xarray to store the pfns and their orders to emulate a page table. Make it act more like a real iommu driver by replacing the xarray with an iommupt based page table. The new AMDv1 mock format behaves similarly to the xarray. Add set_dirty() as a iommu_pt operation to allow the test suite to simulate HW dirty. Userspace can select between several formats including the normal AMDv1 format and a special MOCK_IOMMUPT_HUGE variation for testing huge page dirty tracking. To make the dirty tracking test work the page table must only store exactly 2M huge pages otherwise the logic the test uses fails. They cannot be broken up or combined. Aside from aligning the selftest with a real page table implementation, this helps test the iommupt code itself. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 39 +++ drivers/iommu/iommufd/Kconfig | 1 + drivers/iommu/iommufd/iommufd_test.h | 11 +- drivers/iommu/iommufd/selftest.c | 424 +++++++++++--------------- include/linux/generic_pt/iommu.h | 12 + tools/testing/selftests/iommu/iommufd.c | 60 ++-- tools/testing/selftests/iommu/iommufd_utils.h | 12 + 7 files changed, 282 insertions(+), 277 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 448c5796d4a8..142001f5aa83 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -264,6 +264,41 @@ int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain, } EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU"); +static inline int __set_dirty(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, __set_dirty); + case PT_ENTRY_OA: + if (!pt_entry_make_write_dirty(&pts)) + return -EAGAIN; + return 0; + } + return -ENOENT; +} + +static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table, + dma_addr_t iova) +{ + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, iova, 1); + if (ret) + return ret; + + /* + * Note: There is no locking here yet, if the test suite races this it + * can crash. It should use RCU locking eventually. + */ + return pt_walk_range(&range, __set_dirty, NULL); +} + struct pt_iommu_collect_args { struct iommu_pages_list free_list; /* Fail if any OAs are within the range */ @@ -957,6 +992,10 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ + IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) + .set_dirty = NS(set_dirty), +#endif .get_info = NS(get_info), .deinit = NS(deinit), }; diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 2beeb4f60ee5..eae3f03629b0 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -41,6 +41,7 @@ config IOMMUFD_TEST depends on DEBUG_KERNEL depends on FAULT_INJECTION depends on RUNTIME_TESTING_MENU + depends on IOMMU_PT_AMDV1 select IOMMUFD_DRIVER default n help diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 8fc618b2bcf9..781a75c79eea 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -31,9 +31,18 @@ enum { IOMMU_TEST_OP_PASID_CHECK_HWPT, }; +enum { + MOCK_IOMMUPT_DEFAULT = 0, + MOCK_IOMMUPT_HUGE, + MOCK_IOMMUPT_AMDV1, +}; + +/* These values are true for MOCK_IOMMUPT_DEFAULT */ enum { MOCK_APERTURE_START = 1UL << 24, MOCK_APERTURE_LAST = (1UL << 31) - 1, + MOCK_PAGE_SIZE = 2048, + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE, }; enum { @@ -52,7 +61,6 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, - MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; @@ -205,6 +213,7 @@ struct iommu_test_hw_info { */ struct iommu_hwpt_selftest { __u32 iotlb; + __u32 pagetable_type; }; /* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */ diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 5661d2da2b67..f6379f387d3b 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include "../iommu-pages.h" #include "../iommu-priv.h" #include "io_pagetable.h" @@ -41,21 +43,6 @@ static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, - MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, - MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, - - /* - * Like a real page table alignment requires the low bits of the address - * to be zero. xarray also requires the high bit to be zero, so we store - * the pfns shifted. The upper bits are used for metadata. - */ - MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, - - _MOCK_PFN_START = MOCK_PFN_MASK + 1, - MOCK_PFN_START_IOVA = _MOCK_PFN_START, - MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, - MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, - MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain); @@ -124,10 +111,15 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + union { + struct iommu_domain domain; + struct pt_iommu iommu; + struct pt_iommu_amdv1 amdv1; + }; unsigned long flags; - struct iommu_domain domain; - struct xarray pfns; }; +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain); static inline struct mock_iommu_domain * to_mock_domain(struct iommu_domain *domain) @@ -344,74 +336,6 @@ static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, return 0; } -static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, - unsigned long iova, size_t page_size, - unsigned long flags) -{ - unsigned long cur, end = iova + page_size - 1; - bool dirty = false; - void *ent, *old; - - for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); - if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) - continue; - - dirty = true; - /* Clear dirty */ - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { - unsigned long val; - - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - } - } - - return dirty; -} - -static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long end = iova + size; - void *ent; - - if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) - return -EINVAL; - - do { - unsigned long pgsize = MOCK_IO_PAGE_SIZE; - unsigned long head; - - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent) { - iova += pgsize; - continue; - } - - if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) - pgsize = MOCK_HUGE_PAGE_SIZE; - head = iova & ~(pgsize - 1); - - /* Clear dirty */ - if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -static const struct iommu_dirty_ops dirty_ops = { - .set_dirty_tracking = mock_domain_set_dirty_tracking, - .read_and_clear_dirty = mock_domain_read_and_clear_dirty, -}; - static struct mock_iommu_domain_nested * __mock_domain_alloc_nested(const struct iommu_user_data *user_data) { @@ -446,7 +370,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); - if (!parent || parent->ops != mock_ops.default_domain_ops) + if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING)) return ERR_PTR(-EINVAL); mock_parent = to_mock_domain(parent); @@ -459,159 +383,170 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, return &mock_nested->domain; } -static struct iommu_domain * -mock_domain_alloc_paging_flags(struct device *dev, u32 flags, - const struct iommu_user_data *user_data) -{ - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_PASID; - struct mock_dev *mdev = to_mock_dev(dev); - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct mock_iommu_domain *mock; - - if (user_data) - return ERR_PTR(-EOPNOTSUPP); - if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - - mock = kzalloc(sizeof(*mock), GFP_KERNEL); - if (!mock) - return ERR_PTR(-ENOMEM); - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; - mock->domain.ops = mock_ops.default_domain_ops; - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - xa_init(&mock->pfns); - - if (has_dirty_flag) - mock->domain.dirty_ops = &dirty_ops; - return &mock->domain; -} - static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = to_mock_domain(domain); - WARN_ON(!xa_empty(&mock->pfns)); + pt_iommu_deinit(&mock->iommu); kfree(mock); } -static int mock_domain_map_pages(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t pgsize, size_t pgcount, int prot, - gfp_t gfp, size_t *mapped) +static void mock_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - unsigned long flags = MOCK_PFN_START_IOVA; - unsigned long start_iova = iova; + iommu_put_pages_list(&gather->freelist); +} - /* - * xarray does not reliably work with fault injection because it does a - * retry allocation, so put our own failure point. - */ - if (iommufd_should_fail()) - return -ENOENT; +static const struct iommu_domain_ops amdv1_mock_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); - for (; pgcount; pgcount--) { - size_t cur; +static const struct iommu_domain_ops amdv1_mock_huge_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1_mock), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; +#undef pt_iommu_amdv1_mock_map_pages - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - void *old; +static const struct iommu_dirty_ops amdv1_mock_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1_mock), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - flags = MOCK_PFN_LAST_IOVA; - if (pgsize != MOCK_IO_PAGE_SIZE) { - flags |= MOCK_PFN_HUGE_IOVA; - } - old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, - xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | - flags), - gfp); - if (xa_is_err(old)) { - for (; start_iova != iova; - start_iova += MOCK_IO_PAGE_SIZE) - xa_erase(&mock->pfns, - start_iova / - MOCK_IO_PAGE_SIZE); - return xa_err(old); - } - WARN_ON(old); - iova += MOCK_IO_PAGE_SIZE; - paddr += MOCK_IO_PAGE_SIZE; - *mapped += MOCK_IO_PAGE_SIZE; - flags = 0; - } - } - return 0; -} +static const struct iommu_domain_ops amdv1_ops = { + IOMMU_PT_DOMAIN_OPS(amdv1), + .free = mock_domain_free, + .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, + .iotlb_sync = &mock_iotlb_sync, +}; + +static const struct iommu_dirty_ops amdv1_dirty_ops = { + IOMMU_PT_DIRTY_OPS(amdv1), + .set_dirty_tracking = mock_domain_set_dirty_tracking, +}; -static size_t mock_domain_unmap_pages(struct iommu_domain *domain, - unsigned long iova, size_t pgsize, - size_t pgcount, - struct iommu_iotlb_gather *iotlb_gather) +static struct mock_iommu_domain * +mock_domain_alloc_pgtable(struct device *dev, + const struct iommu_hwpt_selftest *user_cfg, u32 flags) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - bool first = true; - size_t ret = 0; - void *ent; + struct mock_iommu_domain *mock; + int rc; - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return ERR_PTR(-ENOMEM); + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; - for (; pgcount; pgcount--) { - size_t cur; + mock->amdv1.iommu.nid = NUMA_NO_NODE; + + switch (user_cfg->pagetable_type) { + case MOCK_IOMMUPT_DEFAULT: + case MOCK_IOMMUPT_HUGE: { + struct pt_iommu_amdv1_cfg cfg = {}; + + /* The mock version has a 2k page size */ + cfg.common.hw_max_vasz_lg2 = 56; + cfg.common.hw_max_oasz_lg2 = 51; + cfg.starting_level = 2; + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.ops = &amdv1_mock_huge_ops; + else + mock->domain.ops = &amdv1_mock_ops; + rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + + /* + * In huge mode userspace should only provide huge pages, we + * have to include PAGE_SIZE for the domain to be accepted by + * iommufd. + */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) + mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE | + PAGE_SIZE; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_mock_dirty_ops; + break; + } - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { - ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + case MOCK_IOMMUPT_AMDV1: { + struct pt_iommu_amdv1_cfg cfg = {}; + + cfg.common.hw_max_vasz_lg2 = 64; + cfg.common.hw_max_oasz_lg2 = 52; + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); + cfg.starting_level = 2; + mock->domain.ops = &amdv1_ops; + rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL); + if (rc) + goto err_free; + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + mock->domain.dirty_ops = &amdv1_dirty_ops; + break; + } + default: + rc = -EOPNOTSUPP; + goto err_free; + } - /* - * iommufd generates unmaps that must be a strict - * superset of the map's performend So every - * starting/ending IOVA should have been an iova passed - * to map. - * - * This simple logic doesn't work when the HUGE_PAGE is - * turned on since the core code will automatically - * switch between the two page sizes creating a break in - * the unmap calls. The break can land in the middle of - * contiguous IOVA. - */ - if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { - if (first) { - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); - first = false; - } - if (pgcount == 1 && - cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(ent && !(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); - } + /* + * Override the real aperture to the MOCK aperture for test purposes. + */ + if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) { + WARN_ON(mock->domain.geometry.aperture_start != 0); + WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST); - iova += MOCK_IO_PAGE_SIZE; - ret += MOCK_IO_PAGE_SIZE; - } + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; } - return ret; + + return mock; +err_free: + kfree(mock); + return ERR_PTR(rc); } -static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, - dma_addr_t iova) +static struct iommu_domain * +mock_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) { - struct mock_iommu_domain *mock = to_mock_domain(domain); - void *ent; + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; + struct mock_dev *mdev = to_mock_dev(dev); + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_hwpt_selftest user_cfg = {}; + struct mock_iommu_domain *mock; + int rc; + + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + + if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST && + user_data->type != IOMMU_HWPT_DATA_NONE)) + return ERR_PTR(-EOPNOTSUPP); - WARN_ON(iova % MOCK_IO_PAGE_SIZE); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); - return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; + if (user_data) { + rc = iommu_copy_struct_from_user( + &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + } + + mock = mock_domain_alloc_pgtable(dev, &user_cfg, flags); + if (IS_ERR(mock)) + return ERR_CAST(mock); + return &mock->domain; } static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) @@ -955,15 +890,6 @@ static const struct iommu_ops mock_ops = { .user_pasid_table = true, .get_viommu_size = mock_get_viommu_size, .viommu_init = mock_viommu_init, - .default_domain_ops = - &(struct iommu_domain_ops){ - .free = mock_domain_free, - .attach_dev = mock_domain_nop_attach, - .map_pages = mock_domain_map_pages, - .unmap_pages = mock_domain_unmap_pages, - .iova_to_phys = mock_domain_iova_to_phys, - .set_dev_pasid = mock_domain_set_dev_pasid_nop, - }, }; static void mock_domain_free_nested(struct iommu_domain *domain) @@ -1047,7 +973,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || - hwpt->domain->ops != mock_ops.default_domain_ops) { + hwpt->domain->owner != &mock_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } @@ -1088,7 +1014,6 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) {}, }; const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | - MOCK_FLAGS_DEVICE_HUGE_IOVA | MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; int rc, i; @@ -1277,23 +1202,25 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, { struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; + unsigned int page_size; uintptr_t end; int rc; - if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || - (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || - check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) - return -EINVAL; - hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - for (; length; length -= MOCK_IO_PAGE_SIZE) { + page_size = 1 << __ffs(mock->domain.pgsize_bitmap); + if (iova % page_size || length % page_size || + (uintptr_t)uptr % page_size || + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) + return -EINVAL; + + for (; length; length -= page_size) { struct page *pages[1]; + phys_addr_t io_phys; unsigned long pfn; long npages; - void *ent; npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, pages); @@ -1308,15 +1235,14 @@ static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, pfn = page_to_pfn(pages[0]); put_page(pages[0]); - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - if (!ent || - (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != - pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova); + if (io_phys != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { rc = -EINVAL; goto out_put; } - iova += MOCK_IO_PAGE_SIZE; - uptr += MOCK_IO_PAGE_SIZE; + iova += page_size; + uptr += page_size; } rc = 0; @@ -1795,7 +1721,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - if (!(mock->flags & MOCK_DIRTY_TRACK)) { + if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) { rc = -EINVAL; goto out_put; } @@ -1814,22 +1740,10 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } for (i = 0; i < max; i++) { - unsigned long cur = iova + i * page_size; - void *ent, *old; - if (!test_bit(i, (unsigned long *)tmp)) continue; - - ent = xa_load(&mock->pfns, cur / page_size); - if (ent) { - unsigned long val; - - val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; - old = xa_store(&mock->pfns, cur / page_size, - xa_mk_value(val), GFP_KERNEL); - WARN_ON_ONCE(ent != old); - count++; - } + mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size); + count++; } cmd->dirty.out_nr_dirty = count; @@ -2202,3 +2116,5 @@ void iommufd_test_exit(void) platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); } + +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 848a5fb76272..f2a763aba088 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -73,6 +73,18 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @set_dirty: Make the iova write dirty + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * + * This is only used by iommufd testing. It makes the iova dirty so that + * read_and_clear_dirty() will see it as dirty. Unlike all the other ops + * this one is safe to call without holding any locking. It may return + * -EAGAIN if there is a race. + */ + int (*set_dirty)(struct pt_iommu *iommu_table, dma_addr_t iova); + /** * @get_info: Return the pt_iommu_info structure * @iommu_table: Table to query diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 3eebf5e3b974..595b0a3ead64 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -13,9 +13,6 @@ static unsigned long HUGEPAGE_SIZE; -#define MOCK_PAGE_SIZE (PAGE_SIZE / 2) -#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) - static unsigned long get_huge_page_size(void) { char buf[80]; @@ -2058,6 +2055,12 @@ FIXTURE_VARIANT(iommufd_dirty_tracking) FIXTURE_SETUP(iommufd_dirty_tracking) { + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_SET, + .val64 = 0, + }; size_t mmap_buffer_size; unsigned long size; int mmap_flags; @@ -2066,7 +2069,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking) if (variant->buffer_size < MOCK_PAGE_SIZE) { SKIP(return, - "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu", + "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u", variant->buffer_size, MOCK_PAGE_SIZE); } @@ -2114,16 +2117,18 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - /* Enable 1M mock IOMMU hugepages */ - if (variant->hugepages) { - test_cmd_mock_domain_flags(self->ioas_id, - MOCK_FLAGS_DEVICE_HUGE_IOVA, - &self->stdev_id, &self->hwpt_id, - &self->idev_id); - } else { - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id, &self->idev_id); - } + + /* + * For dirty testing it is important that the page size fed into + * the iommu page tables matches the size the dirty logic + * expects, or set_dirty can touch too much stuff. + */ + cmd.object_id = self->ioas_id; + if (!variant->hugepages) + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, + &self->idev_id); } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -2248,18 +2253,23 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); @@ -2285,18 +2295,24 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 772ca1db6e59..08e529fde1cc 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -215,6 +215,18 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ 0)) +#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \ + hwpt_id) \ + ({ \ + struct iommu_hwpt_selftest user_cfg = { \ + .pagetable_type = iommupt_type \ + }; \ + \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, 0, flags, \ + hwpt_id, IOMMU_HWPT_DATA_SELFTEST, \ + &user_cfg, sizeof(user_cfg))); \ + }) #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ self->fd, device_id, pt_id, 0, flags, \ -- cgit v1.2.3 From aef5de756ea871ab44e3a1a87be6c944e6587c51 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:10 -0400 Subject: iommupt: Add the x86 64 bit page table format This is used by x86 CPUs and can be used in AMD/VT-d x86 IOMMUs. When a x86 IOMMU is running SVA the MM will be using this format. This implementation follows the AMD v2 io-pgtable version. There is nothing remarkable here, the format can have 4 or 5 levels and limited support for different page sizes. No contiguous pages support. x86 uses a sign extension mechanism where the top bits of the VA must match the sign bit. The core code supports this through PT_FEAT_SIGN_EXTEND which creates and upper and lower VA range. All the new operations will work correctly in both spaces, however currently there is no way to report the upper space to other layers. Future patches can improve that. In principle this can support 3 page tables levels matching the 32 bit PAE table format, but no iommu driver needs this. The focus is on the modern 64 bit 4 and 5 level formats. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 71,61 , 66,58 , -13.13 2^21, 66,60 , 61,55 , -10.10 2^30, 59,56 , 56,54 , -3.03 256*2^12, 392,1360 , 345,1289 , 73.73 256*2^21, 383,1159 , 335,1145 , 70.70 256*2^30, 378,965 , 331,892 , 62.62 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 77,71 , 73,68 , -7.07 2^21, 76,70 , 70,66 , -6.06 2^30, 69,66 , 66,63 , -4.04 256*2^12, 225,899 , 210,870 , 75.75 256*2^21, 262,722 , 248,710 , 65.65 256*2^30, 251,643 , 244,634 , 61.61 The small -ve values in the iommu_unmap() are due to the core code calling iommu_pgsize() before invoking the domain op. This is unncessary with this implementation. Future work optimizes this and gets to 2%, 4%, 3%. Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/.kunitconfig | 1 + drivers/iommu/generic_pt/Kconfig | 11 ++ drivers/iommu/generic_pt/fmt/Makefile | 2 + drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 +++ drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11 ++ drivers/iommu/generic_pt/fmt/x86_64.h | 255 ++++++++++++++++++++++++++++ include/linux/generic_pt/common.h | 13 ++ include/linux/generic_pt/iommu.h | 11 ++ 8 files changed, 325 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index 936c327f0661..2016c5e5ac0f 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y CONFIG_IOMMUFD=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index 81652cd9c69f..6dcb771b3c58 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -42,10 +42,21 @@ config IOMMU_PT_AMDV1 Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_X86_64 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the x86 64-bit 4/5 level page table. + It supports 4K/2M/1G page sizes and can decode a sign-extended + portion of the 64-bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + config IOMMU_PT_KUNIT_TEST tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 default KUNIT_ALL_TESTS help Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index f0c22cf5f7be..5a3379107999 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -3,6 +3,8 @@ iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock +iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 + IOMMU_PT_KUNIT_TEST := define create_format obj-$(2) += iommu_$(1).o diff --git a/drivers/iommu/generic_pt/fmt/defs_x86_64.h b/drivers/iommu/generic_pt/fmt/defs_x86_64.h new file mode 100644 index 000000000000..6f589e1f55d3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_x86_64.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H +#define __GENERIC_PT_FMT_DEFS_X86_64_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct x86_64_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs x86_64_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_x86_64.c b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c new file mode 100644 index 000000000000..5c5960d871a3 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_x86_64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT x86_64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/x86_64.h b/drivers/iommu/generic_pt/fmt/x86_64.h new file mode 100644 index 000000000000..18d736d14b2d --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/x86_64.h @@ -0,0 +1,255 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + * + * x86 page table. Supports the 4 and 5 level variations. + * + * The 4 and 5 level version is described in: + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software + * Developer's Manual Volume 3 + * + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization + * Technology for Directed I/O Architecture Specification" + * + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O + * Virtualization Technology (IOMMU) Specification" + * + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. + * + * Note the 3 level format is very similar and almost implemented here. The + * reserved/ignored layout is different and there are functional bit + * differences. + * + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign + * extended addressing with this page table format. + * + * The named levels in the spec map to the pts->level as: + * Table/PTE - 0 + * Directory/PDE - 1 + * Directory Ptr/PDPTE - 2 + * PML4/PML4E - 3 + * PML5/PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_X86_64_H +#define __GENERIC_PT_FMT_X86_64_H + +#include "defs_x86_64.h" +#include "../pt_defs.h" + +#include +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* + * For AMD the GCR3 Base only has these bits. For VT-d FSPTPTR is 4k + * aligned and is limited by the architected HAW + */ + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), +}; + +/* Shared descriptor bits */ +enum { + X86_64_FMT_P = BIT(0), + X86_64_FMT_RW = BIT(1), + X86_64_FMT_U = BIT(2), + X86_64_FMT_A = BIT(5), + X86_64_FMT_D = BIT(6), + X86_64_FMT_OA = GENMASK_ULL(51, 12), + X86_64_FMT_XD = BIT_ULL(63), +}; + +/* PDPTE/PDE */ +enum { + X86_64_FMT_PS = BIT(7), +}; + +static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa x86_64_pt_table_pa + +static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) +{ + u64 entry = pts->entry; + + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_clr(entry); + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa x86_64_pt_entry_oa + +static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf x86_64_pt_can_have_leaf + +static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 x86_64_pt_num_items_lg2 + +static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & X86_64_FMT_P)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw x86_64_pt_load_entry_raw + +static inline void +x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = X86_64_FMT_P | + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= X86_64_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry x86_64_pt_install_leaf_entry + +static inline bool x86_64_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + entry = __sme_set(entry); + return pt_table_install64(pts, entry); +} +#define pt_install_table x86_64_pt_install_table + +static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | + X86_64_FMT_D | X86_64_FMT_XD); +} +#define pt_attr_from_entry x86_64_pt_attr_from_entry + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_x86_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->x86_64_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) + ->iommu; +} + +static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = X86_64_FMT_U | X86_64_FMT_A | X86_64_FMT_D; + if (iommu_prot & IOMMU_WRITE) + pte |= X86_64_FMT_RW; + + /* + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to + * control this. For now if the tables use sme_set then so do the ptes. + */ + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) + pte = __sme_set(pte); + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot x86_64_pt_iommu_set_prot + +static inline int +x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, + const struct pt_iommu_x86_64_cfg *cfg) +{ + struct pt_x86_64 *table = &iommu_table->x86_64_pt; + + if (cfg->common.hw_max_vasz_lg2 < 31 || + cfg->common.hw_max_vasz_lg2 > 57) + return -EINVAL; + + /* Top of 2, 3, 4 */ + pt_top_set_level(&table->common, + (cfg->common.hw_max_vasz_lg2 - 31) / 9 + 2); + + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init + +static inline void +x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, + const struct pt_range *top_range, + struct pt_iommu_x86_64_hw_info *info) +{ + info->gcr3_pt = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); + info->levels = top_range->top_level + 1; +} +#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 48 }, + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), + .common.hw_max_vasz_lg2 = 57 }, + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ + [2] = { .common.hw_max_vasz_lg2 = 47 }, + [3] = { .common.hw_max_vasz_lg2 = 56 }, +}; +#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; +#endif +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 21e33489cbf2..96f8a6a7d60e 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -151,4 +151,17 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_x86_64 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted. Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. + */ + PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index f2a763aba088..fde7ccf007c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -255,6 +255,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_x86_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_x86_64_hw_info { + u64 gcr3_pt; + u8 levels; +}; + +IOMMU_FORMAT(x86_64, x86_64_pt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From 2fdf6db436e3071a8e4c9c3e67674448a13860d4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:12 -0400 Subject: iommu/amd: Remove AMD io_pgtable support None of this is used anymore, delete it. Reviewed-by: Alejandro Jimenez Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/amd/Makefile | 2 +- drivers/iommu/amd/amd_iommu_types.h | 98 ------ drivers/iommu/amd/io_pgtable.c | 575 ------------------------------------ drivers/iommu/amd/io_pgtable_v2.c | 370 ----------------------- drivers/iommu/io-pgtable.c | 4 - include/linux/io-pgtable.h | 2 - 6 files changed, 1 insertion(+), 1050 deletions(-) delete mode 100644 drivers/iommu/amd/io_pgtable.c delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c (limited to 'include/linux') diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index 59c04a67f398..5412a563c697 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o +obj-y += iommu.o init.o quirks.o ppr.o pasid.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index d90a285b44eb..4b4a37fad70e 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -18,7 +18,6 @@ #include #include #include -#include #include /* @@ -338,76 +337,7 @@ #define GUEST_PGTABLE_4_LEVEL 0x00 #define GUEST_PGTABLE_5_LEVEL 0x01 -#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) -#define PM_LEVEL_SIZE(x) (((x) < 6) ? \ - ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ - (0xffffffffffffffffULL)) -#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) -#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) -#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ - IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) - -#define PM_MAP_4k 0 #define PM_ADDR_MASK 0x000ffffffffff000ULL -#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ - (~((1ULL << (12 + ((lvl) * 9))) - 1))) -#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) - -/* - * Returns the page table level to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_LEVEL(pagesize) \ - ((__ffs(pagesize) - 12) / 9) -/* - * Returns the number of ptes to use for a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_PTE_COUNT(pagesize) \ - (1ULL << ((__ffs(pagesize) - 12) % 9)) - -/* - * Aligns a given io-virtual address to a given page size - * Pagesize is expected to be a power-of-two - */ -#define PAGE_SIZE_ALIGN(address, pagesize) \ - ((address) & ~((pagesize) - 1)) -/* - * Creates an IOMMU PTE for an address and a given pagesize - * The PTE has no permission bits set - * Pagesize is expected to be a power-of-two larger than 4096 - */ -#define PAGE_SIZE_PTE(address, pagesize) \ - (((address) | ((pagesize) - 1)) & \ - (~(pagesize >> 1)) & PM_ADDR_MASK) - -/* - * Takes a PTE value with mode=0x07 and returns the page size it maps - */ -#define PTE_PAGE_SIZE(pte) \ - (1ULL << (1 + ffz(((pte) | 0xfffULL)))) - -/* - * Takes a page-table level and returns the default page-size for this level - */ -#define PTE_LEVEL_PAGE_SIZE(level) \ - (1ULL << (12 + (9 * (level)))) - -/* - * The IOPTE dirty bit - */ -#define IOMMU_PTE_HD_BIT (6) - -/* - * Bit value definition for I/O PTE fields - */ -#define IOMMU_PTE_PR BIT_ULL(0) -#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) -#define IOMMU_PTE_U BIT_ULL(59) -#define IOMMU_PTE_FC BIT_ULL(60) -#define IOMMU_PTE_IR BIT_ULL(61) -#define IOMMU_PTE_IW BIT_ULL(62) /* * Bit value definition for DTE fields @@ -437,12 +367,6 @@ /* DTE[128:179] | DTE[184:191] */ #define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52) -#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) -#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) -#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) -#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) -#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) - #define IOMMU_PROT_MASK 0x03 #define IOMMU_PROT_IR 0x01 #define IOMMU_PROT_IW 0x02 @@ -535,19 +459,6 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) -#define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl) - -#define io_pgtable_ops_to_data(x) \ - io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) - -#define io_pgtable_ops_to_domain(x) \ - container_of(io_pgtable_ops_to_data(x), \ - struct protection_domain, iop) - -#define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl.cfg) - struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ int glx; /* Number of levels for GCR3 table */ @@ -555,14 +466,6 @@ struct gcr3_tbl_info { u16 domid; /* Per device domain ID */ }; -struct amd_io_pgtable { - seqcount_t seqcount; /* Protects root/mode update */ - struct io_pgtable pgtbl; - int mode; - u64 *root; - u64 *pgd; /* v2 pgtable pgd pointer */ -}; - enum protection_domain_mode { PD_MODE_NONE, PD_MODE_V1, @@ -597,7 +500,6 @@ struct protection_domain { struct pt_iommu_x86_64 amdv2; }; struct list_head dev_list; /* List of all devices in this domain */ - struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c deleted file mode 100644 index f64244938c9a..000000000000 --- a/drivers/iommu/amd/io_pgtable.c +++ /dev/null @@ -1,575 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table allocator. - * - * Copyright (C) 2020 Advanced Micro Devices, Inc. - * Author: Suravee Suthikulpanit - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - -static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl) -{ - u64 *p; - int i; - - for (i = 0; i < 512; ++i) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - /* Large PTE? */ - if (PM_PTE_LEVEL(pt[i]) == 0 || - PM_PTE_LEVEL(pt[i]) == 7) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. - */ - p = IOMMU_PTE_PAGE(pt[i]); - if (lvl > 2) - free_pt_lvl(p, freelist, lvl - 1); - else - iommu_pages_list_add(freelist, p); - } - - iommu_pages_list_add(freelist, pt); -} - -static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - iommu_pages_list_add(freelist, root); - break; - case PAGE_MODE_2_LEVEL: - case PAGE_MODE_3_LEVEL: - case PAGE_MODE_4_LEVEL: - case PAGE_MODE_5_LEVEL: - case PAGE_MODE_6_LEVEL: - free_pt_lvl(root, freelist, mode); - break; - default: - BUG(); - } -} - -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned int page_size_level, - gfp_t gfp) -{ - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - struct protection_domain *domain = - container_of(pgtable, struct protection_domain, iop); - unsigned long flags; - bool ret = true; - u64 *pte; - - pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K); - if (!pte) - return false; - - spin_lock_irqsave(&domain->lock, flags); - - if (address <= PM_LEVEL_SIZE(pgtable->mode) && - pgtable->mode - 1 >= page_size_level) - goto out; - - ret = false; - if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level)) - goto out; - - *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); - - write_seqcount_begin(&pgtable->seqcount); - pgtable->root = pte; - pgtable->mode += 1; - write_seqcount_end(&pgtable->seqcount); - - pte = NULL; - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - iommu_free_pages(pte); - - return ret; -} - -static u64 *alloc_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - unsigned long last_addr = address + (page_size - 1); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned int seqcount; - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || - pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(pgtable, last_addr, - PAGE_SIZE_LEVEL(page_size), gfp)) - return NULL; - } - - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. - */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, - SZ_4K); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long address, - unsigned long *page_size) -{ - int level; - unsigned int seqcount; - u64 *pte; - - *page_size = 0; - - if (address > PM_LEVEL_SIZE(pgtable->mode)) - return NULL; - - do { - seqcount = read_seqcount_begin(&pgtable->seqcount); - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; - } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); - - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL || - PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static void free_clear_pte(u64 *pte, u64 pteval, - struct iommu_pages_list *freelist) -{ - u64 *pt; - int mode; - - while (!try_cmpxchg64(pte, &pteval, 0)) - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - - if (!IOMMU_PTE_PRESENT(pteval)) - return; - - pt = IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. - */ -static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - size_t size = pgcount << __ffs(pgsize); - unsigned long o_iova = iova; - - BUG_ON(!IS_ALIGNED(iova, pgsize)); - BUG_ON(!IS_ALIGNED(paddr, pgsize)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - while (pgcount > 0) { - count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - free_clear_pte(&pte[i], pte[i], &freelist); - - if (!iommu_pages_list_empty(&freelist)) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - iova += pgsize; - paddr += pgsize; - pgcount--; - if (mapped) - *mapped += pgsize; - } - - ret = 0; - -out: - if (updated) { - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). - */ - amd_iommu_domain_flush_pages(dom, o_iova, size); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - iommu_put_pages_list(&freelist); - - return ret; -} - -static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - size_t size = pgcount << __ffs(pgsize); - - BUG_ON(!is_power_of_2(pgsize)); - - unmapped = 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } else { - return unmapped; - } - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, - unsigned long flags) -{ - bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; - bool dirty = false; - int i, count; - - /* - * 2.2.3.2 Host Dirty Support - * When a non-default page size is used , software must OR the - * Dirty bits in all of the replicated host PTEs used to map - * the page. The IOMMU does not guarantee the Dirty bits are - * set in all of the replicated PTEs. Any portion of the page - * may have been written even if the Dirty bit is set in only - * one of the replicated PTEs. - */ - count = PAGE_SIZE_PTE_COUNT(size); - for (i = 0; i < count && test_only; i++) { - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { - dirty = true; - break; - } - } - - for (i = 0; i < count && !test_only; i++) { - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, - (unsigned long *)&ptep[i])) { - dirty = true; - } - } - - return dirty; -} - -static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, - unsigned long iova, size_t size, - unsigned long flags, - struct iommu_dirty_bitmap *dirty) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long end = iova + size - 1; - - do { - unsigned long pgsize = 0; - u64 *ptep, pte; - - ptep = fetch_pte(pgtable, iova, &pgsize); - if (ptep) - pte = READ_ONCE(*ptep); - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); - iova += pgsize; - continue; - } - - /* - * Mark the whole IOVA range as dirty even if only one of - * the replicated PTEs were marked dirty. - */ - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, iova, pgsize); - iova += pgsize; - } while (iova < end); - - return 0; -} - -/* - * ---------------------------------------------------- - */ -static void v1_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - /* Page-table is not visible to IOMMU anymore, so free it */ - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > amd_iommu_hpt_level); - - free_sub_pt(pgtable->root, pgtable->mode, &freelist); - iommu_put_pages_list(&freelist); -} - -static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - - pgtable->root = - iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->root) - return NULL; - pgtable->mode = PAGE_MODE_3_LEVEL; - seqcount_init(&pgtable->seqcount); - - cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { - .alloc = v1_alloc_pgtable, - .free = v1_free_pgtable, -}; diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c deleted file mode 100644 index b47941353ccb..000000000000 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ /dev/null @@ -1,370 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * CPU-agnostic AMD IO page table v2 allocator. - * - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. - * Author: Suravee Suthikulpanit - * Author: Vasant Hegde - */ - -#define pr_fmt(fmt) "AMD-Vi: " fmt -#define dev_fmt(fmt) pr_fmt(fmt) - -#include -#include -#include - -#include - -#include "amd_iommu_types.h" -#include "amd_iommu.h" -#include "../iommu-pages.h" - -#define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ -#define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ -#define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ -#define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ -#define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ -#define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ -#define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ -#define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ -#define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ - -#define MAX_PTRS_PER_PAGE 512 - -#define IOMMU_PAGE_SIZE_2M BIT_ULL(21) -#define IOMMU_PAGE_SIZE_1G BIT_ULL(30) - - -static inline int get_pgtable_level(void) -{ - return amd_iommu_gpt_level; -} - -static inline bool is_large_pte(u64 pte) -{ - return (pte & IOMMU_PAGE_PSE); -} - -static inline u64 set_pgtable_attr(u64 *page) -{ - u64 prot; - - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS; - - return (iommu_virt_to_phys(page) | prot); -} - -static inline void *get_pgtable_pte(u64 pte) -{ - return iommu_phys_to_virt(pte & PM_ADDR_MASK); -} - -static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) -{ - u64 pte; - - pte = __sme_set(paddr & PM_ADDR_MASK); - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; - - if (prot & IOMMU_PROT_IW) - pte |= IOMMU_PAGE_RW; - - /* Large page */ - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) - pte |= IOMMU_PAGE_PSE; - - return pte; -} - -static inline u64 get_alloc_page_size(u64 size) -{ - if (size >= IOMMU_PAGE_SIZE_1G) - return IOMMU_PAGE_SIZE_1G; - - if (size >= IOMMU_PAGE_SIZE_2M) - return IOMMU_PAGE_SIZE_2M; - - return PAGE_SIZE; -} - -static inline int page_size_to_level(u64 pg_size) -{ - if (pg_size == IOMMU_PAGE_SIZE_1G) - return PAGE_MODE_3_LEVEL; - if (pg_size == IOMMU_PAGE_SIZE_2M) - return PAGE_MODE_2_LEVEL; - - return PAGE_MODE_1_LEVEL; -} - -static void free_pgtable(u64 *pt, int level) -{ - u64 *p; - int i; - - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { - /* PTE present? */ - if (!IOMMU_PTE_PRESENT(pt[i])) - continue; - - if (is_large_pte(pt[i])) - continue; - - /* - * Free the next level. No need to look at l1 tables here since - * they can only contain leaf PTEs; just free them directly. - */ - p = get_pgtable_pte(pt[i]); - if (level > 2) - free_pgtable(p, level - 1); - else - iommu_free_pages(p); - } - - iommu_free_pages(pt); -} - -/* Allocate page table */ -static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, - unsigned long pg_size, gfp_t gfp, bool *updated) -{ - u64 *pte, *page; - int level, end_level; - - level = get_pgtable_level() - 1; - end_level = page_size_to_level(pg_size); - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); - - while (level >= end_level) { - u64 __pte, __npte; - - __pte = *pte; - - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { - /* Unmap large pte */ - cmpxchg64(pte, *pte, 0ULL); - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte)) { - page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K); - if (!page) - return NULL; - - __npte = set_pgtable_attr(page); - /* pte could have been changed somewhere. */ - if (!try_cmpxchg64(pte, &__pte, __npte)) - iommu_free_pages(page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - level -= 1; - pte = get_pgtable_pte(__pte); - pte = &pte[PM_LEVEL_INDEX(level, iova)]; - } - - /* Tear down existing pte entries */ - if (IOMMU_PTE_PRESENT(*pte)) { - u64 *__pte; - - *updated = true; - __pte = get_pgtable_pte(*pte); - cmpxchg64(pte, *pte, 0ULL); - if (pg_size == IOMMU_PAGE_SIZE_1G) - free_pgtable(__pte, end_level - 1); - else if (pg_size == IOMMU_PAGE_SIZE_2M) - iommu_free_pages(__pte); - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. - * If there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct amd_io_pgtable *pgtable, - unsigned long iova, unsigned long *page_size) -{ - u64 *pte; - int level; - - level = get_pgtable_level() - 1; - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; - /* Default page size is 4K */ - *page_size = PAGE_SIZE; - - while (level) { - /* Not present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Walk to the next level */ - pte = get_pgtable_pte(*pte); - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; - - /* Large page */ - if (is_large_pte(*pte)) { - if (level == PAGE_MODE_3_LEVEL) - *page_size = IOMMU_PAGE_SIZE_1G; - else if (level == PAGE_MODE_2_LEVEL) - *page_size = IOMMU_PAGE_SIZE_2M; - else - return NULL; /* Wrongly set PSE bit in PTE */ - - break; - } - - level -= 1; - } - - return pte; -} - -static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - u64 *pte; - unsigned long map_size; - unsigned long mapped_size = 0; - unsigned long o_iova = iova; - size_t size = pgcount << __ffs(pgsize); - int ret = 0; - bool updated = false; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) - return -EINVAL; - - if (!(prot & IOMMU_PROT_MASK)) - return -EINVAL; - - while (mapped_size < size) { - map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, - iova, map_size, gfp, &updated); - if (!pte) { - ret = -ENOMEM; - goto out; - } - - *pte = set_pte_attr(paddr, map_size, prot); - - iova += map_size; - paddr += map_size; - mapped_size += map_size; - } - -out: - if (updated) { - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - unsigned long flags; - - spin_lock_irqsave(&pdom->lock, flags); - amd_iommu_domain_flush_pages(pdom, o_iova, size); - spin_unlock_irqrestore(&pdom->lock, flags); - } - - if (mapped) - *mapped += mapped_size; - - return ret; -} - -static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, - unsigned long iova, - size_t pgsize, size_t pgcount, - struct iommu_iotlb_gather *gather) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; - unsigned long unmap_size; - unsigned long unmapped = 0; - size_t size = pgcount << __ffs(pgsize); - u64 *pte; - - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) - return 0; - - while (unmapped < size) { - pte = fetch_pte(pgtable, iova, &unmap_size); - if (!pte) - return unmapped; - - *pte = 0ULL; - - iova = (iova & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - return unmapped; -} - -static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) -{ - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - unsigned long offset_mask, pte_pgsize; - u64 *pte, __pte; - - pte = fetch_pte(pgtable, iova, &pte_pgsize); - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); -} - -/* - * ---------------------------------------------------- - */ -static void v2_free_pgtable(struct io_pgtable *iop) -{ - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); - - if (!pgtable || !pgtable->pgd) - return; - - /* Free page table */ - free_pgtable(pgtable->pgd, get_pgtable_level()); - pgtable->pgd = NULL; -} - -static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) -{ - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - int ias = IOMMU_IN_ADDR_BIT_SIZE; - - pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); - if (!pgtable->pgd) - return NULL; - - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) - ias = 57; - - pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages; - pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; - - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - cfg->ias = ias; - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - - return &pgtable->pgtbl; -} - -struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { - .alloc = v2_alloc_pgtable, - .free = v2_free_pgtable, -}; diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index 8841c1487f00..843fec8e8a51 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -28,10 +28,6 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, #endif -#ifdef CONFIG_AMD_IOMMU - [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns, - [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns, -#endif }; static int check_custom_allocator(enum io_pgtable_fmt fmt, diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 8a823c6f2b4a..7a1516011ccf 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -15,8 +15,6 @@ enum io_pgtable_fmt { ARM_64_LPAE_S2, ARM_V7S, ARM_MALI_LPAE, - AMD_IOMMU_V1, - AMD_IOMMU_V2, APPLE_DART, APPLE_DART2, IO_PGTABLE_NUM_FMTS, -- cgit v1.2.3 From bc5233c0904eb116a4bd94e10cd3666733216063 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:13 -0400 Subject: iommupt: Add a kunit test for the IOMMU implementation This intends to have high coverage of the page table format functions and the IOMMU implementation itself, exercising the various corner cases. The kunit tests can be run in the kunit framework, using commands like: tools/testing/kunit/kunit.py run --build_dir build_kunit_arm64 --arch arm64 --make_options LLVM=-19 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_uml --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_x86_64 --arch x86_64 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_i386 --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_i386pae --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig --kconfig_add CONFIG_X86_PAE=y There are several interesting corner cases on the 32 bit platforms that need checking. Like the generic tests, these are run on the format's configuration list using kunit "params". This also checks the core iommu parts of the page table code as it enters the logic through a mock iommu_domain. The following are checked: - PT_FEAT_DYNAMIC_TOP properly adds levels one by one - Every page size can be iommu_map()'d, and mapping creates that size - iommu_iova_to_phys() works with every page size - Test converting OA -> non present -> OA when the two OAs overlap and free table levels - Test that unmap stops at holes, unmap doesn't split, and unmap returns the right values for partial unmap requests - Randomly map/unmap. Checks map with random sizes, that map fails when hitting collisions doing nothing, unmap/map with random intersections and full unmap of random sizes. Also checks iommu_iova_to_phys() with random sizes - Check for memory leaks by monitoring NR_SECONDARY_PAGETABLE Reviewed-by: Kevin Tian Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/fmt/iommu_template.h | 1 + drivers/iommu/generic_pt/kunit_iommu.h | 2 + drivers/iommu/generic_pt/kunit_iommu_pt.h | 487 ++++++++++++++++++++++++++ include/linux/irqchip/riscv-imsic.h | 3 +- 4 files changed, 491 insertions(+), 2 deletions(-) create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/fmt/iommu_template.h b/drivers/iommu/generic_pt/fmt/iommu_template.h index 11e85106ae30..d28e86abdf2e 100644 --- a/drivers/iommu/generic_pt/fmt/iommu_template.h +++ b/drivers/iommu/generic_pt/fmt/iommu_template.h @@ -44,4 +44,5 @@ * which means we are building the kunit modle. */ #include "../kunit_generic_pt.h" +#include "../kunit_iommu_pt.h" #endif diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h index 28ec313f151e..d541235632aa 100644 --- a/drivers/iommu/generic_pt/kunit_iommu.h +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -71,6 +71,8 @@ struct kunit_iommu_priv { unsigned int largest_pgsz_lg2; pt_oaddr_t test_oa; pt_vaddr_t safe_pgsize_bitmap; + unsigned long orig_nr_secondary_pagetable; + }; PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain); diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h new file mode 100644 index 000000000000..e8a63c8ea850 --- /dev/null +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "kunit_iommu.h" +#include "pt_iter.h" +#include +#include + +static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len); + +struct count_valids { + u64 per_size[PT_VADDR_MAX_LG2]; +}; + +static int __count_valids(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct count_valids *valids = arg; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + pt_descend(&pts, arg, __count_valids); + continue; + } + if (pts.type == PT_ENTRY_OA) { + valids->per_size[pt_entry_oa_lg2sz(&pts)]++; + continue; + } + } + return 0; +} + +/* + * Number of valid table entries. This counts contiguous entries as a single + * valid. + */ +static unsigned int count_valids(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) + total += valids.per_size[i]; + return total; +} + +/* Only a single page size is present, count the number of valid entries */ +static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + u64 total = 0; + unsigned int i; + + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + + for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) { + if ((1ULL << i) == pgsz) + total = valids.per_size[i]; + else + KUNIT_ASSERT_EQ(test, valids.per_size[i], 0); + } + return total; +} + +static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + size_t ret; + + ret = iommu_unmap(&priv->domain, va, len); + KUNIT_ASSERT_EQ(test, ret, len); +} + +static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa, + pt_vaddr_t len) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2); + pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2); + + for (; pfn != end_pfn; pfn++) { + phys_addr_t res = iommu_iova_to_phys(&priv->domain, + pfn * priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa); + if (res != pa) + break; + pa += priv->smallest_pgsz; + } +} + +static void test_increase_level(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_common *common = priv->common; + + if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP)) + kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format"); + + if (IS_32BIT) + kunit_skip(test, "Unable to test on 32bit"); + + KUNIT_ASSERT_GT(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + + /* Add every possible level to the max */ + while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) { + struct pt_range top_range = pt_top_range(common); + + if (top_range.va == 0) + do_map(test, top_range.last_va + 1, 0, + priv->smallest_pgsz); + else + do_map(test, top_range.va - priv->smallest_pgsz, 0, + priv->smallest_pgsz); + + KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level, + top_range.top_level + 1); + KUNIT_ASSERT_GE(test, common->max_vasz_lg2, + pt_top_range(common).max_vasz_lg2); + } +} + +static void test_map_simple(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range range = pt_top_range(priv->common); + struct count_valids valids = {}; + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + pt_vaddr_t cur_va; + + /* Map every reported page size */ + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + + cur_va = ALIGN(cur_va, len); + do_map(test, cur_va, paddr, len); + if (len <= SZ_2G) + check_iova(test, cur_va, paddr, len); + cur_va += len; + } + + /* The read interface reports that every page size was created */ + range = pt_top_range(priv->common); + KUNIT_ASSERT_NO_ERRNO(test, + pt_walk_range(&range, __count_valids, &valids)); + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + if (pgsize_bitmap & (1ULL << pgsz_lg2)) + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1); + else + KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0); + } + + /* Unmap works */ + range = pt_top_range(priv->common); + cur_va = range.va + priv->smallest_pgsz * 256; + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + u64 len = log2_to_int(pgsz_lg2); + + if (!(pgsize_bitmap & len)) + continue; + cur_va = ALIGN(cur_va, len); + do_unmap(test, cur_va, len); + cur_va += len; + } + KUNIT_ASSERT_EQ(test, count_valids(test), 0); +} + +/* + * Test to convert a table pointer into an OA by mapping something small, + * unmapping it so as to leave behind a table pointer, then mapping something + * larger that will convert the table into an OA. + */ +static void test_map_table_to_oa(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + pt_vaddr_t limited_pgbitmap = + priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G); + struct pt_range range = pt_top_range(priv->common); + unsigned int pgsz_lg2; + pt_vaddr_t max_pgsize; + pt_vaddr_t cur_va; + + max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1); + KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize); + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2); + u64 len = log2_to_int(pgsz_lg2); + pt_vaddr_t offset; + + if (!(priv->info.pgsize_bitmap & len)) + continue; + if (len > max_pgsize) + break; + + cur_va = ALIGN(range.va + priv->smallest_pgsz * 256, + max_pgsize); + for (offset = 0; offset != max_pgsize; offset += len) + do_map(test, cur_va + offset, paddr + offset, len); + check_iova(test, cur_va, paddr, max_pgsize); + KUNIT_ASSERT_EQ(test, count_valids_single(test, len), + log2_div(max_pgsize, pgsz_lg2)); + + if (len == max_pgsize) { + do_unmap(test, cur_va, max_pgsize); + } else { + do_unmap(test, cur_va, max_pgsize / 2); + for (offset = max_pgsize / 2; offset != max_pgsize; + offset += len) + do_unmap(test, cur_va + offset, len); + } + + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + } +} + +/* + * Test unmapping a small page at the start of a large page. This always unmaps + * the large page. + */ +static void test_unmap_split(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap; + unsigned int pgsz_lg2; + unsigned int count = 0; + + for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) { + pt_vaddr_t base_len = log2_to_int(pgsz_lg2); + unsigned int next_pgsz_lg2; + + if (!(pgsize_bitmap & base_len)) + continue; + + for (next_pgsz_lg2 = pgsz_lg2 + 1; + next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) { + pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2); + pt_vaddr_t vaddr = top_range.va; + pt_oaddr_t paddr = 0; + size_t gnmapped; + + if (!(pgsize_bitmap & next_len)) + continue; + + do_map(test, vaddr, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + /* Make sure unmap doesn't keep going */ + do_map(test, vaddr, paddr, next_len); + do_map(test, vaddr + next_len, paddr, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr, base_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + gnmapped = iommu_unmap(&priv->domain, vaddr + next_len, + next_len); + KUNIT_ASSERT_EQ(test, gnmapped, next_len); + + count++; + } + } + + if (count == 0) + kunit_skip(test, "Test needs two page sizes"); +} + +static void unmap_collisions(struct kunit *test, struct maple_tree *mt, + pt_vaddr_t start, pt_vaddr_t last) +{ + struct kunit_iommu_priv *priv = test->priv; + MA_STATE(mas, mt, start, last); + void *entry; + + mtree_lock(mt); + mas_for_each(&mas, entry, last) { + pt_vaddr_t mas_start = mas.index; + pt_vaddr_t len = (mas.last - mas_start) + 1; + pt_oaddr_t paddr; + + mas_erase(&mas); + mas_pause(&mas); + mtree_unlock(mt); + + paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2); + check_iova(test, mas_start, paddr, len); + do_unmap(test, mas_start, len); + mtree_lock(mt); + } + mtree_unlock(mt); +} + +static void clamp_range(struct kunit *test, struct pt_range *range) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (range->last_va - range->va > SZ_1G) + range->last_va = range->va + SZ_1G; + KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX); + if (range->va <= MAPLE_RESERVED_RANGE) + range->va = + ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz); +} + +/* + * Randomly map and unmap ranges that can large physical pages. If a random + * range overlaps with existing ranges then unmap them. This hits all the + * special cases. + */ +static void test_random_map(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range upper_range = pt_upper_range(priv->common); + struct pt_range top_range = pt_top_range(priv->common); + struct maple_tree mt; + unsigned int iter; + + mt_init(&mt); + + /* + * Shrink the range so randomization is more likely to have + * intersections + */ + clamp_range(test, &top_range); + clamp_range(test, &upper_range); + + for (iter = 0; iter != 1000; iter++) { + struct pt_range *range = &top_range; + pt_oaddr_t paddr; + pt_vaddr_t start; + pt_vaddr_t end; + int ret; + + if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) && + ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1)) + range = &upper_range; + + start = get_random_u32_below( + min(U32_MAX, range->last_va - range->va)); + end = get_random_u32_below( + min(U32_MAX, range->last_va - start)); + + start = ALIGN_DOWN(start, priv->smallest_pgsz); + end = ALIGN(end, priv->smallest_pgsz); + start += range->va; + end += start; + if (start < range->va || end > range->last_va + 1 || + start >= end) + continue; + + /* Try overmapping to test the failure handling */ + paddr = oalog2_mod(start, priv->common->max_oasz_lg2); + ret = iommu_map(&priv->domain, start, paddr, end - start, + IOMMU_READ | IOMMU_WRITE, GFP_KERNEL); + if (ret) { + KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE); + unmap_collisions(test, &mt, start, end - 1); + do_map(test, start, paddr, end - start); + } + + KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range", + mtree_insert_range(&mt, start, end - 1, + XA_ZERO_ENTRY, + GFP_KERNEL)); + + check_iova(test, start, paddr, end - start); + if (iter % 100) + cond_resched(); + } + + unmap_collisions(test, &mt, 0, PT_VADDR_MAX); + KUNIT_ASSERT_EQ(test, count_valids(test), 0); + + mtree_destroy(&mt); +} + +/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */ +static void test_pgsize_boundary(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + + if (top_range.va != 0 || top_range.last_va < 0xfef9ffff || + priv->smallest_pgsz != SZ_4K) + kunit_skip(test, "Format does not have the required range"); + + do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1); +} + +/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */ +static void test_mixed(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + pt_vaddr_t len = end - start; + pt_oaddr_t oa = start; + + if (top_range.last_va <= start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + do_map(test, start, oa, len); + /* 14 2M, 3 1G, 3 2M */ + KUNIT_ASSERT_EQ(test, count_valids(test), 20); + check_iova(test, start, oa, len); +} + +static struct kunit_case iommu_test_cases[] = { + KUNIT_CASE_FMT(test_increase_level), + KUNIT_CASE_FMT(test_map_simple), + KUNIT_CASE_FMT(test_map_table_to_oa), + KUNIT_CASE_FMT(test_unmap_split), + KUNIT_CASE_FMT(test_random_map), + KUNIT_CASE_FMT(test_pgsize_boundary), + KUNIT_CASE_FMT(test_mixed), + {}, +}; + +static int pt_kunit_iommu_init(struct kunit *test) +{ + struct kunit_iommu_priv *priv; + int ret; + + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->orig_nr_secondary_pagetable = + global_node_page_state(NR_SECONDARY_PAGETABLE); + ret = pt_kunit_priv_init(test, priv); + if (ret) { + kunit_kfree(test, priv); + return ret; + } + test->priv = priv; + return 0; +} + +static void pt_kunit_iommu_exit(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + + if (!test->priv) + return; + + pt_iommu_deinit(priv->iommu); + /* + * Look for memory leaks, assumes kunit is running isolated and nothing + * else is using secondary page tables. + */ + KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable, + global_node_page_state(NR_SECONDARY_PAGETABLE)); + kunit_kfree(test, test->priv); +} + +static struct kunit_suite NS(iommu_suite) = { + .name = __stringify(NS(iommu_test)), + .init = pt_kunit_iommu_init, + .exit = pt_kunit_iommu_exit, + .test_cases = iommu_test_cases, +}; +kunit_test_suites(&NS(iommu_suite)); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kunit for generic page table"); +MODULE_IMPORT_NS("GENERIC_PT_IOMMU"); diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index 7494952c5518..7f3ff5c5ea53 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -10,7 +10,6 @@ #include #include #include -#include #define IMSIC_MMIO_PAGE_SHIFT 12 #define IMSIC_MMIO_PAGE_SZ BIT(IMSIC_MMIO_PAGE_SHIFT) @@ -86,7 +85,7 @@ static inline const struct imsic_global_config *imsic_get_global_config(void) #endif -#ifdef CONFIG_ACPI +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_RISCV_IMSIC) int imsic_platform_acpi_probe(struct fwnode_handle *fwnode); struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev); #else -- cgit v1.2.3 From aefd967dab6469f5b827b59e50016a760dcc1fbc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 Oct 2025 15:22:31 -0300 Subject: iommupt: Use the incoherent start/stop functions for PT_FEAT_DMA_INCOHERENT This is the first step to supporting an incoherent walker, start and stop the incoherence around the allocation and frees of the page table memory. The iommu_pages API maps this to dma_map/unmap_single(), or arch cache flushing calls. Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 89 ++++++++++++++++++++++++++-------- drivers/iommu/generic_pt/kunit_iommu.h | 1 + drivers/iommu/generic_pt/pt_defs.h | 5 +- include/linux/generic_pt/common.h | 6 +++ include/linux/generic_pt/iommu.h | 7 +++ 5 files changed, 88 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 142001f5aa83..2cad07da995a 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -24,6 +24,10 @@ static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather, { struct pt_common *common = common_from_iommu(iommu_table); + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(free_list, + iommu_table->iommu_device); + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); @@ -329,35 +333,55 @@ static int __collect_tables(struct pt_range *range, void *arg, return 0; } -static inline struct pt_table_p *table_alloc_top(struct pt_common *common, - uintptr_t top_of_table, - gfp_t gfp) +enum alloc_mode {ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH}; + +/* Allocate a table, the empty table will be ready to be installed. */ +static inline struct pt_table_p *_table_alloc(struct pt_common *common, + size_t lg2sz, gfp_t gfp, + enum alloc_mode mode) { struct pt_iommu *iommu_table = iommu_from_common(common); + struct pt_table_p *table_mem; + + table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp, + log2_to_int(lg2sz)); + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + mode == ALLOC_NORMAL) { + int ret = iommu_pages_start_incoherent( + table_mem, iommu_table->iommu_device); + if (ret) { + iommu_free_pages(table_mem); + return ERR_PTR(ret); + } + } + return table_mem; +} +static inline struct pt_table_p *table_alloc_top(struct pt_common *common, + uintptr_t top_of_table, + gfp_t gfp, + enum alloc_mode mode) +{ /* * Top doesn't need the free list or otherwise, so it technically * doesn't need to use iommu pages. Use the API anyhow as the top is * usually not smaller than PAGE_SIZE to keep things simple. */ - return iommu_alloc_pages_node_sz( - iommu_table->nid, gfp, - log2_to_int(pt_top_memsize_lg2(common, top_of_table))); + return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table), + gfp, mode); } /* Allocate an interior table */ static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts, - gfp_t gfp) + gfp_t gfp, enum alloc_mode mode) { - struct pt_iommu *iommu_table = - iommu_from_common(parent_pts->range->common); struct pt_state child_pts = pt_init(parent_pts->range, parent_pts->level - 1, NULL); - return iommu_alloc_pages_node_sz( - iommu_table->nid, gfp, - log2_to_int(pt_num_items_lg2(&child_pts) + - ilog2(PT_ITEM_WORD_SIZE))); + return _table_alloc(parent_pts->range->common, + pt_num_items_lg2(&child_pts) + + ilog2(PT_ITEM_WORD_SIZE), + gfp, mode); } static inline int pt_iommu_new_table(struct pt_state *pts, @@ -370,13 +394,15 @@ static inline int pt_iommu_new_table(struct pt_state *pts, if (PT_WARN_ON(!pt_can_have_table(pts))) return -ENXIO; - table_mem = table_alloc(pts, attrs->gfp); + table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL); if (IS_ERR(table_mem)) return PTR_ERR(table_mem); phys = virt_to_phys(table_mem); if (!pt_install_table(pts, phys, attrs)) { - iommu_free_pages(table_mem); + iommu_pages_free_incoherent( + table_mem, + iommu_from_common(pts->range->common)->iommu_device); return -EAGAIN; } @@ -389,7 +415,9 @@ static inline int pt_iommu_new_table(struct pt_state *pts, pt_load_single_entry(pts); if (PT_WARN_ON(pt_table_pa(pts) != phys)) { pt_clear_entries(pts, ilog2(1)); - iommu_free_pages(table_mem); + iommu_pages_free_incoherent( + table_mem, iommu_from_common(pts->range->common) + ->iommu_device); return -EINVAL; } } @@ -615,8 +643,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, } new_level = pts.level; - table_mem = table_alloc_top( - common, _pt_top_set(NULL, pts.level), map->attrs.gfp); + table_mem = + table_alloc_top(common, _pt_top_set(NULL, pts.level), + map->attrs.gfp, ALLOC_DEFER_COHERENT_FLUSH); if (IS_ERR(table_mem)) return PTR_ERR(table_mem); iommu_pages_list_add(&free_list, table_mem); @@ -633,6 +662,16 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, new_top_of_table = _pt_top_set(pts.table, pts.level); } + /* + * Avoid double flushing, flush it once after all pt_install_table() + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + ret = iommu_pages_start_incoherent_list( + &free_list, iommu_table->iommu_device); + if (ret) + goto err_free; + } + /* * top_of_table is write locked by the spinlock, but readers can use * READ_ONCE() to get the value. Since we encode both the level and the @@ -665,6 +704,9 @@ static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range, return 0; err_free: + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&free_list, + iommu_table->iommu_device); iommu_put_pages_list(&free_list); return ret; } @@ -988,6 +1030,9 @@ static void NS(deinit)(struct pt_iommu *iommu_table) * The driver has to already have fenced the HW access to the page table * and invalidated any caching referring to this memory. */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + iommu_pages_stop_incoherent_list(&collect.free_list, + iommu_table->iommu_device); iommu_put_pages_list(&collect.free_list); } @@ -1078,6 +1123,7 @@ static void pt_iommu_zero(struct pt_iommu_table *fmt_table) memset_after(fmt_table, 0, iommu.domain); /* The caller can initialize some of these values */ + iommu_table->iommu_device = cfg.iommu_device; iommu_table->driver_ops = cfg.driver_ops; iommu_table->nid = cfg.nid; } @@ -1123,11 +1169,16 @@ int pt_iommu_init(struct pt_iommu_table *fmt_table, pt_feature(common, PT_FEAT_DYNAMIC_TOP))) return -EINVAL; + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) && + WARN_ON(!iommu_table->iommu_device)) + return -EINVAL; + ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain); if (ret) return ret; - table_mem = table_alloc_top(common, common->top_of_table, gfp); + table_mem = table_alloc_top(common, common->top_of_table, gfp, + ALLOC_NORMAL); if (IS_ERR(table_mem)) return PTR_ERR(table_mem); pt_top_set(common, table_mem, pt_top_get_level(common)); diff --git a/drivers/iommu/generic_pt/kunit_iommu.h b/drivers/iommu/generic_pt/kunit_iommu.h index d541235632aa..5d4f269627d5 100644 --- a/drivers/iommu/generic_pt/kunit_iommu.h +++ b/drivers/iommu/generic_pt/kunit_iommu.h @@ -139,6 +139,7 @@ static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv) priv->fmt_table.iommu.nid = NUMA_NO_NODE; priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops; + priv->fmt_table.iommu.iommu_device = priv->dummy_dev; priv->domain.ops = &kunit_pt_ops; ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL); if (ret) { diff --git a/drivers/iommu/generic_pt/pt_defs.h b/drivers/iommu/generic_pt/pt_defs.h index 819057de50d8..c25544d72f97 100644 --- a/drivers/iommu/generic_pt/pt_defs.h +++ b/drivers/iommu/generic_pt/pt_defs.h @@ -48,13 +48,16 @@ enum { /* * When in debug mode we compile all formats with all features. This allows the * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or - * FULL_VA. + * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have */ #if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) enum { PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES, PT_DEBUG_SUPPORTED_FEATURES = UINT_MAX & + ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ? + 0 : + BIT(PT_FEAT_DMA_INCOHERENT))) & ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ? BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) : BIT(PT_FEAT_SIGN_EXTEND)), diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 96f8a6a7d60e..883069e32952 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -85,6 +85,12 @@ enum { * position. */ enum pt_features { + /** + * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before + * assuming the HW can read it. Otherwise a SMP release is sufficient + * for HW to read it. + */ + PT_FEAT_DMA_INCOHERENT, /** * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to * PT_VADDR_MAX. diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index fde7ccf007c5..21132e342a79 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -57,6 +57,13 @@ struct pt_iommu { * table walkers. */ int nid; + + /** + * @iommu_device: Device pointer used for any DMA cache flushing when + * PT_FEAT_DMA_INCOHERENT. This is the iommu device that created the + * page table which must have dma ops that perform cache flushing. + */ + struct device *iommu_device; }; /** -- cgit v1.2.3 From 5448c1558f60d4051c90938f2878c6fb20e2982a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 Oct 2025 15:22:33 -0300 Subject: iommupt: Add the Intel VT-d second stage page table format The VT-d second stage format is almost the same as the x86 PAE format, except the bit encodings in the PTE are different and a few new PTE features, like force coherency are present. Among all the formats it is unique in not having a designated present bit. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 53,66 , 50,64 , 21.21 2^21, 59,70 , 56,67 , 16.16 2^30, 54,66 , 52,63 , 17.17 256*2^12, 384,524 , 337,516 , 34.34 256*2^21, 387,632 , 336,626 , 46.46 256*2^30, 376,629 , 323,623 , 48.48 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 67,86 , 63,84 , 25.25 2^21, 64,84 , 59,80 , 26.26 2^30, 59,78 , 56,74 , 24.24 256*2^12, 216,335 , 198,317 , 37.37 256*2^21, 245,350 , 232,344 , 32.32 256*2^30, 248,345 , 226,339 , 33.33 Cc: Tina Zhang Cc: Kevin Tian Cc: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/.kunitconfig | 1 + drivers/iommu/generic_pt/Kconfig | 11 ++ drivers/iommu/generic_pt/fmt/Makefile | 2 + drivers/iommu/generic_pt/fmt/defs_vtdss.h | 21 +++ drivers/iommu/generic_pt/fmt/iommu_vtdss.c | 10 + drivers/iommu/generic_pt/fmt/vtdss.h | 292 +++++++++++++++++++++++++++++ include/linux/generic_pt/common.h | 18 ++ include/linux/generic_pt/iommu.h | 11 ++ 8 files changed, 366 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/defs_vtdss.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_vtdss.c create mode 100644 drivers/iommu/generic_pt/fmt/vtdss.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index 2016c5e5ac0f..52ac9e661ffd 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -3,6 +3,7 @@ CONFIG_GENERIC_PT=y CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y +CONFIG_IOMMU_PT_VTDSS=y CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index 6dcb771b3c58..79f65268f312 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -42,6 +42,16 @@ config IOMMU_PT_AMDV1 Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_VTDSS + tristate "IOMMU page table for Intel VT-d Second Stage" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 + level Second Stage page table. It is similar to the X86_64 format with + 4K/2M/1G page sizes. + + Selected automatically by an IOMMU driver that uses this format. + config IOMMU_PT_X86_64 tristate "IOMMU page table for x86 64-bit, 4/5 levels" depends on !GENERIC_ATOMIC64 # for cmpxchg64 @@ -57,6 +67,7 @@ config IOMMU_PT_KUNIT_TEST depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS default KUNIT_ALL_TESTS help Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 5a3379107999..976b49ec97dc 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -3,6 +3,8 @@ iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock +iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 IOMMU_PT_KUNIT_TEST := diff --git a/drivers/iommu/generic_pt/fmt/defs_vtdss.h b/drivers/iommu/generic_pt/fmt/defs_vtdss.h new file mode 100644 index 000000000000..4a239bcaae2a --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_vtdss.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H +#define __GENERIC_PT_FMT_DEFS_VTDSS_H + +#include +#include + +typedef u64 pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct vtdss_pt_write_attrs { + u64 descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs vtdss_pt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_vtdss.c b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c new file mode 100644 index 000000000000..f551711e2a33 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_vtdss.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT vtdss +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/vtdss.h b/drivers/iommu/generic_pt/fmt/vtdss.h new file mode 100644 index 000000000000..d9774848eb6f --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/vtdss.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + * + * Intel VT-d Second Stange 5/4 level page table + * + * This is described in + * Section "3.7 Second-Stage Translation" + * Section "9.8 Second-Stage Paging Entries" + * + * Of the "Intel Virtualization Technology for Directed I/O Architecture + * Specification". + * + * The named levels in the spec map to the pts->level as: + * Table/SS-PTE - 0 + * Directory/SS-PDE - 1 + * Directory Ptr/SS-PDPTE - 2 + * PML4/SS-PML4E - 3 + * PML5/SS-PML5E - 4 + */ +#ifndef __GENERIC_PT_FMT_VTDSS_H +#define __GENERIC_PT_FMT_VTDSS_H + +#include "defs_vtdss.h" +#include "../pt_defs.h" + +#include +#include +#include + +enum { + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_ITEM_WORD_SIZE = sizeof(u64), + PT_MAX_TOP_LEVEL = 4, + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* SSPTPTR is 4k aligned and limited by HAW */ + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), +}; + +/* Shared descriptor bits */ +enum { + VTDSS_FMT_R = BIT(0), + VTDSS_FMT_W = BIT(1), + VTDSS_FMT_A = BIT(8), + VTDSS_FMT_D = BIT(9), + VTDSS_FMT_SNP = BIT(11), + VTDSS_FMT_OA = GENMASK_ULL(51, 12), +}; + +/* PDPTE/PDE */ +enum { + VTDSS_FMT_PS = BIT(7), +}; + +#define common_to_vtdss_pt(common_ptr) \ + container_of_const(common_ptr, struct pt_vtdss, common) +#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) + +static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_TABLEMEM_LG2SZ); +} +#define pt_table_pa vtdss_pt_table_pa + +static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), + PT_GRANULE_LG2SZ); +} +#define pt_entry_oa vtdss_pt_entry_oa + +static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) +{ + return pts->level <= 2; +} +#define pt_can_have_leaf vtdss_pt_can_have_leaf + +static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 vtdss_pt_num_items_lg2 + +static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) +{ + const u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!entry) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw vtdss_pt_load_entry_raw + +static inline void +vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + u64 *tablep = pt_cur_table(pts, u64); + u64 entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + if (pts->level != 0) + entry |= VTDSS_FMT_PS; + + WRITE_ONCE(tablep[pts->index], entry); + pts->entry = entry; +} +#define pt_install_leaf_entry vtdss_pt_install_leaf_entry + +static inline bool vtdss_pt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + u64 entry; + + entry = VTDSS_FMT_R | VTDSS_FMT_W | + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table vtdss_pt_install_table + +static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = pts->entry & + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); +} +#define pt_attr_from_entry vtdss_pt_attr_from_entry + +static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + return READ_ONCE(*tablep) & VTDSS_FMT_D; +} +#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty + +static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); +} +#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean + +static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) +{ + u64 *tablep = pt_cur_table(pts, u64) + pts->index; + u64 new = pts->entry | VTDSS_FMT_D; + + return try_cmpxchg64(tablep, &pts->entry, new); +} +#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty + +static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) +{ + return 10; +} +#define pt_max_sw_bit vtdss_pt_max_sw_bit + +static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) +{ + /* Bits marked Ignored in the specification */ + switch (bitnr) { + case 0: + return BIT(10); + case 1 ... 9: + return BIT_ULL((bitnr - 1) + 52); + case 10: + return BIT_ULL(63); + /* Some bits in 9-3 are available in some entries */ + default: + if (__builtin_constant_p(bitnr)) + BUILD_BUG(); + else + PT_WARN_ON(true); + return 0; + } +} +#define pt_sw_bit vtdss_pt_sw_bit + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_vtdss + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->vtdss_pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) + ->iommu; +} + +static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte = 0; + + /* + * VTDSS does not have a present bit, so we tell if any entry is present + * by checking for R or W. + */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return -EINVAL; + + if (iommu_prot & IOMMU_READ) + pte |= VTDSS_FMT_R; + if (iommu_prot & IOMMU_WRITE) + pte |= VTDSS_FMT_W; + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE)) + pte |= VTDSS_FMT_SNP; + + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) && + !(iommu_prot & IOMMU_WRITE)) { + pr_err_ratelimited( + "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot vtdss_pt_iommu_set_prot + +static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table, + const struct pt_iommu_vtdss_cfg *cfg) +{ + struct pt_vtdss *table = &iommu_table->vtdss_pt; + unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2; + + if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2) + return -EOPNOTSUPP; + else if (vasz_lg2 > 48) + pt_top_set_level(&table->common, 4); + else if (vasz_lg2 > 39) + pt_top_set_level(&table->common, 3); + else if (vasz_lg2 > 30) + pt_top_set_level(&table->common, 2); + else + return -EOPNOTSUPP; + return 0; +} +#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init + +static inline void +vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table, + const struct pt_range *top_range, + struct pt_iommu_vtdss_hw_info *info) +{ + info->ssptptr = virt_to_phys(top_range->top_table); + PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK); + /* + * top_level = 2 = 3 level table aw=1 + * top_level = 3 = 4 level table aw=2 + * top_level = 4 = 5 level table aw=3 + */ + info->aw = top_range->top_level - 1; +} +#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = { + [0] = { .common.hw_max_vasz_lg2 = 39 }, + [1] = { .common.hw_max_vasz_lg2 = 48 }, + [2] = { .common.hw_max_vasz_lg2 = 57 }, +}; +#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs +enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) }; +#endif +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 883069e32952..6a9a1acb5aad 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -157,6 +157,24 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_vtdss { + struct pt_common common; +}; + +enum { + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. + */ + PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START, + /* + * Prevent creating read-only PTEs. Used to work around HW errata + * ERRATA_772415_SPR17. + */ + PT_FEAT_VTDSS_FORCE_WRITEABLE, +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 21132e342a79..cfe05a77f86b 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_vtdss_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_vtdss_hw_info { + u64 ssptptr; + u8 aw; +}; + +IOMMU_FORMAT(vtdss, vtdss_pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; }; -- cgit v1.2.3 From ae83f3b72621bd3187eb7956c7c2993a97d4b187 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 9 Oct 2025 20:06:09 -0700 Subject: module: Add compile-time check for embedded NUL characters Long ago, the kernel module license checks were bypassed by embedding a NUL character in the MODULE_LICENSE() string[1]. By using a string like "GPL\0proprietary text", the kernel would only read "GPL" due to C string termination at the NUL byte, allowing proprietary modules to avoid kernel tainting and access GPL-only symbols. The MODULE_INFO() macro stores these strings in the .modinfo ELF section, and get_next_modinfo() uses strcmp()-family functions which stop at the first NUL. This split the embedded string into two separate .modinfo entries, with only the first part being processed by license_is_gpl_compatible(). Add a compile-time check using static_assert that compares the full string length (sizeof - 1) against __builtin_strlen(), which stops at the first NUL. If they differ, compilation fails with a clear error message. While this check can still be circumvented by modifying the ELF binary post-compilation, it prevents accidental embedded NULs and forces intentional abuse to require deliberate binary manipulation rather than simple source-level tricks. Build tested with test modules containing both valid and invalid license strings. The check correctly rejects: MODULE_LICENSE("GPL\0proprietary") while accepting normal declarations: MODULE_LICENSE("GPL") Link: https://lwn.net/Articles/82305/ [1] Suggested-by: Rusty Russell Signed-off-by: Kees Cook Reviewed-by: Daniel Gomez Reviewed-by: Aaron Tomlin Reviewed-by: Petr Pavlu Tested-by: Daniel Gomez Signed-off-by: Daniel Gomez --- include/linux/moduleparam.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 6907aedc4f74..915f32f7d888 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -26,6 +26,9 @@ /* Generic info of form tag = "info" */ #define MODULE_INFO(tag, info) \ + static_assert( \ + sizeof(info) - 1 == __builtin_strlen(info), \ + "MODULE_INFO(" #tag ", ...) contains embedded NUL byte"); \ static const char __UNIQUE_ID(modinfo)[] \ __used __section(".modinfo") __aligned(1) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info -- cgit v1.2.3 From 0a0da3f92118950862700497bc7917f0fbf6a6e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:24 -0700 Subject: KVM: Make support for kvm_arch_vcpu_async_ioctl() mandatory Implement kvm_arch_vcpu_async_ioctl() "natively" in x86 and arm64 instead of relying on an #ifdef'd stub, and drop HAVE_KVM_VCPU_ASYNC_IOCTL in anticipation of using the API on x86. Once x86 uses the API, providing a stub for one architecture and having all other architectures opt-in requires more code than simply implementing the API in the lone holdout. Eliminating the Kconfig will also reduce churn if the API is renamed in the future (spoiler alert). No functional change intended. Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 6 ++++++ arch/loongarch/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 10 ---------- virt/kvm/Kconfig | 3 --- 9 files changed, 12 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..ef5bf57f79b7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ae64bbdf83a7..ed4f724db774 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_MSI select HAVE_KVM_READONLY_MEM - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index ab57221fa4dd..cc13cc35f208 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 2f2702c867f7..c9a2d50ff1b0 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select KVM_COMMON - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select HAVE_KVM_IRQ_BYPASS diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index c50328212917..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI - select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_READONLY_MEM select HAVE_KVM_DIRTY_RING_ACQ_REL select KVM_COMMON diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..96d16028e8b7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..ca5ba2caf314 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,6 +7240,12 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..7186b2ae4b57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2437,18 +2437,8 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5f0015c5dd95..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS tristate select IRQ_BYPASS_MANAGER -config HAVE_KVM_VCPU_ASYNC_IOCTL - bool - config HAVE_KVM_VCPU_RUN_PID_CHANGE bool -- cgit v1.2.3 From 50efc2340a598da4bafa40bc01e18f8cf73a4ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:25 -0700 Subject: KVM: Rename kvm_arch_vcpu_async_ioctl() to kvm_arch_vcpu_unlocked_ioctl() Rename the "async" ioctl API to "unlocked" so that upcoming usage in x86's TDX code doesn't result in a massive misnomer. To avoid having to retry SEAMCALLs, TDX needs to acquire kvm->lock *and* all vcpu->mutex locks, and acquiring all of those locks after/inside the current vCPU's mutex is a non-starter. However, TDX also needs to acquire the vCPU's mutex and load the vCPU, i.e. the handling is very much not async to the vCPU. No functional change intended. Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 4 ++-- arch/loongarch/kvm/vcpu.c | 4 ++-- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/riscv/kvm/vcpu.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 6 +++--- 9 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ef5bf57f79b7..cf23f6b07ec7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,8 +1835,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 30e3b089a596..9a5844e85fd3 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1471,8 +1471,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a75587018f44..b0fb92fda4d4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2ba057171ebe..9a89a6d98f97 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bccb919ca615..a4bd6077eecc 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..8c4caa5f2fcd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca5ba2caf314..b85cb213a336 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,8 +7240,8 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7186b2ae4b57..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,8 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..b7db1d5f71a8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp, return r; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. */ - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; -- cgit v1.2.3 From 4868d2d52df6f724b01531843805a3b1322e2dd9 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:43 +0800 Subject: crypto: hisilicon - qm updates BAR configuration On new platforms greater than QM_HW_V3, the configuration region for the live migration function of the accelerator device is no longer placed in the VF, but is instead placed in the PF. Therefore, the configuration region of the live migration function needs to be opened when the QM driver is loaded. When the QM driver is uninstalled, the driver needs to clear this configuration. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/crypto/hisilicon/qm.c | 27 +++++++++++++++++++++++++++ include/linux/hisi_acc_qm.h | 3 +++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index a5b96adf2d1e..e88085c2cbb3 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3019,11 +3019,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5725,6 +5750,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5863,6 +5889,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c4690e365ade..ca1ec437a3ca 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ -- cgit v1.2.3 From 6a571d762cda6c25517c5533b8bd06d56028cdcb Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 4 Nov 2025 18:39:05 +0530 Subject: soc: qcom: socinfo: Add support for new fields in revision 20 Add support for socinfo version 20. Version 20 adds a new field package id and its zeroth bit contain information that can be can be used to tune temperature thresholds on devices which might be able to withstand higher temperatures. Zeroth bit value 1 means that its heat dissipation is better and more relaxed thermal scheme can be put in place and 0 means a more aggressive scheme may be needed. Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20251104130906.167666-1-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/socinfo.c | 6 ++++++ include/linux/soc/qcom/socinfo.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index af0188bd3880..37567f5492fa 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -213,6 +213,7 @@ struct socinfo_params { u32 num_func_clusters; u32 boot_cluster; u32 boot_core; + u32 raw_package_type; }; struct smem_image_version { @@ -675,6 +676,11 @@ static void socinfo_debugfs_init(struct qcom_socinfo *qcom_socinfo, &qcom_socinfo->info.fmt); switch (qcom_socinfo->info.fmt) { + case SOCINFO_VERSION(0, 20): + qcom_socinfo->info.raw_package_type = __le32_to_cpu(info->raw_package_type); + debugfs_create_u32("raw_package_type", 0444, qcom_socinfo->dbg_root, + &qcom_socinfo->info.raw_package_type); + fallthrough; case SOCINFO_VERSION(0, 19): qcom_socinfo->info.num_func_clusters = __le32_to_cpu(info->num_func_clusters); qcom_socinfo->info.boot_cluster = __le32_to_cpu(info->boot_cluster); diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index 608950443eee..c4dae173cc30 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -82,6 +82,8 @@ struct socinfo { __le32 num_func_clusters; __le32 boot_cluster; __le32 boot_core; + /* Version 20 */ + __le32 raw_package_type; }; /* Internal feature codes */ -- cgit v1.2.3 From 6918667af5a7315eff3c56d871be4c5439f7f9d2 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 4 Nov 2025 18:39:06 +0530 Subject: soc: qcom: socinfo: Add reserve field to support future extension Some of the new field added to socinfo structure with version 21, 22 and 23 which is only used by boot firmware and it is of no use for Linux.Add reserve field in socinfo so that the structure remain updated and prepared if we get any new field in future which could be used by Linux. While at it, also updates switch case for backward compatibility if the SoC runs with boot firmware which has these new version added. Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20251104130906.167666-2-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/socinfo.c | 3 +++ include/linux/soc/qcom/socinfo.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index 37567f5492fa..003a2304d535 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -676,6 +676,9 @@ static void socinfo_debugfs_init(struct qcom_socinfo *qcom_socinfo, &qcom_socinfo->info.fmt); switch (qcom_socinfo->info.fmt) { + case SOCINFO_VERSION(0, 23): + case SOCINFO_VERSION(0, 22): + case SOCINFO_VERSION(0, 21): case SOCINFO_VERSION(0, 20): qcom_socinfo->info.raw_package_type = __le32_to_cpu(info->raw_package_type); debugfs_create_u32("raw_package_type", 0444, qcom_socinfo->dbg_root, diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index c4dae173cc30..ba823a0013c5 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -84,6 +84,8 @@ struct socinfo { __le32 boot_core; /* Version 20 */ __le32 raw_package_type; + /* Version 21, 22, 23 */ + __le32 reserve1[4]; }; /* Internal feature codes */ -- cgit v1.2.3 From bf9f0b00bb7fd0470c1255bcc8e76c81d122a609 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Wed, 29 Oct 2025 16:00:08 +0530 Subject: include: linux: Destage VCHIQ interface headers Move the VCHIQ headers from drivers/staging/vc04_services/include to include/linux/raspberrypi This is done so that they can be shared between the VCHIQ interface (which is going to be de-staged in a subsequent commit from staging) and the VCHIQ drivers left in the staging/vc04_services (namely bcm2835-audio, bcm2835-camera). The include/linux/raspberrypi/ provides a central location to serve both of these areas. Co-developed-by: Umang Jain Signed-off-by: Umang Jain Reviewed-by: Laurent Pinchart Signed-off-by: Jai Luthra Link: https://patch.msgid.link/20251029-vchiq-destage-v3-4-da8d6c83c2c5@ideasonboard.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + .../vc04_services/bcm2835-audio/bcm2835-vchiq.c | 5 +- .../staging/vc04_services/bcm2835-audio/bcm2835.c | 3 +- .../staging/vc04_services/bcm2835-audio/bcm2835.h | 3 +- .../include/linux/raspberrypi/vchiq.h | 112 ---- .../vc04_services/interface/vchiq_arm/vchiq_arm.c | 9 +- .../vc04_services/interface/vchiq_arm/vchiq_arm.h | 164 ------ .../vc04_services/interface/vchiq_arm/vchiq_bus.c | 4 +- .../vc04_services/interface/vchiq_arm/vchiq_bus.h | 60 -- .../vc04_services/interface/vchiq_arm/vchiq_cfg.h | 41 -- .../vc04_services/interface/vchiq_arm/vchiq_core.c | 4 +- .../vc04_services/interface/vchiq_arm/vchiq_core.h | 646 --------------------- .../interface/vchiq_arm/vchiq_debugfs.c | 6 +- .../interface/vchiq_arm/vchiq_debugfs.h | 22 - .../vc04_services/interface/vchiq_arm/vchiq_dev.c | 7 +- .../interface/vchiq_arm/vchiq_ioctl.h | 3 +- .../staging/vc04_services/vchiq-mmal/mmal-vchiq.c | 5 +- include/linux/raspberrypi/vchiq.h | 112 ++++ include/linux/raspberrypi/vchiq_arm.h | 164 ++++++ include/linux/raspberrypi/vchiq_bus.h | 60 ++ include/linux/raspberrypi/vchiq_cfg.h | 41 ++ include/linux/raspberrypi/vchiq_core.h | 646 +++++++++++++++++++++ include/linux/raspberrypi/vchiq_debugfs.h | 22 + 23 files changed, 1072 insertions(+), 1068 deletions(-) delete mode 100644 drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h delete mode 100644 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h create mode 100644 include/linux/raspberrypi/vchiq.h create mode 100644 include/linux/raspberrypi/vchiq_arm.h create mode 100644 include/linux/raspberrypi/vchiq_bus.h create mode 100644 include/linux/raspberrypi/vchiq_cfg.h create mode 100644 include/linux/raspberrypi/vchiq_core.h create mode 100644 include/linux/raspberrypi/vchiq_debugfs.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 3da2c26a796b..cd223e119d48 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4829,6 +4829,7 @@ T: git https://github.com/broadcom/stblinux.git F: Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml F: drivers/pci/controller/pcie-brcmstb.c F: drivers/staging/vc04_services +F: include/linux/raspberrypi/vchiq* N: bcm2711 N: bcm2712 N: bcm283* diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c index 0dbe76ee5570..7368b384497f 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c +++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835-vchiq.c @@ -4,11 +4,12 @@ #include #include #include + +#include + #include "bcm2835.h" #include "vc_vchi_audioserv_defs.h" -#include "../interface/vchiq_arm/vchiq_arm.h" - struct bcm2835_audio_instance { struct device *dev; unsigned int service_handle; diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c index b74cb104e9de..f292a6618166 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c +++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.c @@ -6,7 +6,8 @@ #include #include -#include "../interface/vchiq_arm/vchiq_bus.h" +#include + #include "bcm2835.h" static bool enable_hdmi; diff --git a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h index 49ec5b496edb..5a1348747ff4 100644 --- a/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h +++ b/drivers/staging/vc04_services/bcm2835-audio/bcm2835.h @@ -5,13 +5,12 @@ #define __SOUND_ARM_BCM2835_H #include +#include #include #include #include #include -#include "../include/linux/raspberrypi/vchiq.h" - #define MAX_SUBSTREAMS (8) #define AVAIL_SUBSTREAMS_MASK (0xff) diff --git a/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h b/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h deleted file mode 100644 index ee4469f4fc51..000000000000 --- a/drivers/staging/vc04_services/include/linux/raspberrypi/vchiq.h +++ /dev/null @@ -1,112 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ - -#ifndef VCHIQ_H -#define VCHIQ_H - -#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \ - (((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3)) - -enum vchiq_reason { - VCHIQ_SERVICE_OPENED, /* service, -, - */ - VCHIQ_SERVICE_CLOSED, /* service, -, - */ - VCHIQ_MESSAGE_AVAILABLE, /* service, header, - */ - VCHIQ_BULK_TRANSMIT_DONE, /* service, -, bulk_userdata */ - VCHIQ_BULK_RECEIVE_DONE, /* service, -, bulk_userdata */ - VCHIQ_BULK_TRANSMIT_ABORTED, /* service, -, bulk_userdata */ - VCHIQ_BULK_RECEIVE_ABORTED /* service, -, bulk_userdata */ -}; - -enum vchiq_bulk_mode { - VCHIQ_BULK_MODE_CALLBACK, - VCHIQ_BULK_MODE_BLOCKING, - VCHIQ_BULK_MODE_NOCALLBACK, - VCHIQ_BULK_MODE_WAITING /* Reserved for internal use */ -}; - -enum vchiq_service_option { - VCHIQ_SERVICE_OPTION_AUTOCLOSE, - VCHIQ_SERVICE_OPTION_SLOT_QUOTA, - VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA, - VCHIQ_SERVICE_OPTION_SYNCHRONOUS, - VCHIQ_SERVICE_OPTION_TRACE -}; - -struct vchiq_header { - /* The message identifier - opaque to applications. */ - int msgid; - - /* Size of message data. */ - unsigned int size; - - char data[]; /* message */ -}; - -struct vchiq_element { - const void __user *data; - unsigned int size; -}; - -struct vchiq_instance; -struct vchiq_state; - -struct vchiq_service_base { - int fourcc; - int (*callback)(struct vchiq_instance *instance, - enum vchiq_reason reason, - struct vchiq_header *header, - unsigned int handle, - void *cb_data, void __user *cb_userdata); - void *userdata; -}; - -struct vchiq_completion_data_kernel { - enum vchiq_reason reason; - struct vchiq_header *header; - void *service_userdata; - void *cb_data; - void __user *cb_userdata; -}; - -struct vchiq_service_params_kernel { - int fourcc; - int (*callback)(struct vchiq_instance *instance, - enum vchiq_reason reason, - struct vchiq_header *header, - unsigned int handle, - void *cb_data, void __user *cb_userdata); - void *userdata; - short version; /* Increment for non-trivial changes */ - short version_min; /* Update for incompatible changes */ -}; - -extern int vchiq_initialise(struct vchiq_state *state, - struct vchiq_instance **pinstance); -extern int vchiq_shutdown(struct vchiq_instance *instance); -extern int vchiq_connect(struct vchiq_instance *instance); -extern int vchiq_open_service(struct vchiq_instance *instance, - const struct vchiq_service_params_kernel *params, - unsigned int *pservice); -extern int vchiq_close_service(struct vchiq_instance *instance, - unsigned int service); -extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service); -extern int vchiq_release_service(struct vchiq_instance *instance, - unsigned int service); -extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle, - struct vchiq_header *header); -extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service, - struct vchiq_header *header); -extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle, - void *data, unsigned int size); -extern int vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service, - const void *data, unsigned int size, void *userdata, - enum vchiq_bulk_mode mode); -extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service, - void *data, unsigned int size, void *userdata, - enum vchiq_bulk_mode mode); -extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service); -extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle, - short *peer_version); -extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle); - -#endif /* VCHIQ_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index a2074069e79e..6a7b96d3dae6 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -30,11 +30,12 @@ #include #include -#include "vchiq_core.h" +#include +#include +#include +#include + #include "vchiq_ioctl.h" -#include "vchiq_arm.h" -#include "vchiq_bus.h" -#include "vchiq_debugfs.h" #define DEVICE_NAME "vchiq" diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h deleted file mode 100644 index e32b02f99024..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.h +++ /dev/null @@ -1,164 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. - * Copyright (c) 2010-2012 Broadcom. All rights reserved. - */ - -#ifndef VCHIQ_ARM_H -#define VCHIQ_ARM_H - -#include -#include -#include -#include -#include "vchiq_core.h" -#include "vchiq_debugfs.h" - -/* Some per-instance constants */ -#define MAX_COMPLETIONS 128 -#define MAX_SERVICES 64 -#define MAX_ELEMENTS 8 -#define MSG_QUEUE_SIZE 128 - -#define VCHIQ_DRV_MAX_CALLBACKS 10 - -struct rpi_firmware; -struct vchiq_device; - -enum USE_TYPE_E { - USE_TYPE_SERVICE, - USE_TYPE_VCHIQ -}; - -struct vchiq_platform_info { - unsigned int cache_line_size; -}; - -struct vchiq_drv_mgmt { - struct rpi_firmware *fw; - const struct vchiq_platform_info *info; - - bool connected; - int num_deferred_callbacks; - /* Protects connected and num_deferred_callbacks */ - struct mutex connected_mutex; - - void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void); - - struct semaphore free_fragments_sema; - struct semaphore free_fragments_mutex; - char *fragments_base; - char *free_fragments; - unsigned int fragments_size; - - void __iomem *regs; - - struct vchiq_state state; -}; - -struct user_service { - struct vchiq_service *service; - void __user *userdata; - struct vchiq_instance *instance; - char is_vchi; - char dequeue_pending; - char close_pending; - int message_available_pos; - int msg_insert; - int msg_remove; - struct completion insert_event; - struct completion remove_event; - struct completion close_event; - struct vchiq_header *msg_queue[MSG_QUEUE_SIZE]; -}; - -struct bulk_waiter_node { - struct bulk_waiter bulk_waiter; - int pid; - struct list_head list; -}; - -struct vchiq_instance { - struct vchiq_state *state; - struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS]; - int completion_insert; - int completion_remove; - struct completion insert_event; - struct completion remove_event; - struct mutex completion_mutex; - - int connected; - int closing; - int pid; - int mark; - int use_close_delivered; - int trace; - - struct list_head bulk_waiter_list; - struct mutex bulk_waiter_list_mutex; - - struct vchiq_debugfs_node debugfs_node; -}; - -int -vchiq_use_service(struct vchiq_instance *instance, unsigned int handle); - -extern int -vchiq_release_service(struct vchiq_instance *instance, unsigned int handle); - -extern int -vchiq_check_service(struct vchiq_service *service); - -extern void -vchiq_dump_service_use_state(struct vchiq_state *state); - -extern int -vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service, - enum USE_TYPE_E use_type); -extern int -vchiq_release_internal(struct vchiq_state *state, - struct vchiq_service *service); - -extern struct vchiq_debugfs_node * -vchiq_instance_get_debugfs_node(struct vchiq_instance *instance); - -extern int -vchiq_instance_get_use_count(struct vchiq_instance *instance); - -extern int -vchiq_instance_get_pid(struct vchiq_instance *instance); - -extern int -vchiq_instance_get_trace(struct vchiq_instance *instance); - -extern void -vchiq_instance_set_trace(struct vchiq_instance *instance, int trace); - -extern void -vchiq_add_connected_callback(struct vchiq_device *device, - void (*callback)(void)); - -#if IS_ENABLED(CONFIG_VCHIQ_CDEV) - -extern void -vchiq_deregister_chrdev(void); - -extern int -vchiq_register_chrdev(struct device *parent); - -#else - -static inline void vchiq_deregister_chrdev(void) { } -static inline int vchiq_register_chrdev(struct device *parent) { return 0; } - -#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */ - -extern int -service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason, - struct vchiq_header *header, unsigned int handle, - void *cb_data, void __user *cb_userdata); - -extern void -free_bulk_waiter(struct vchiq_instance *instance); - -#endif /* VCHIQ_ARM_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c index 41ece91ab88a..f50e637d505c 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.c @@ -11,8 +11,8 @@ #include #include -#include "vchiq_arm.h" -#include "vchiq_bus.h" +#include +#include static int vchiq_bus_type_match(struct device *dev, const struct device_driver *drv) { diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h deleted file mode 100644 index 9de179b39f85..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_bus.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2023 Ideas On Board Oy - */ - -#ifndef _VCHIQ_DEVICE_H -#define _VCHIQ_DEVICE_H - -#include -#include - -struct vchiq_drv_mgmt; - -struct vchiq_device { - struct device dev; - struct vchiq_drv_mgmt *drv_mgmt; -}; - -struct vchiq_driver { - int (*probe)(struct vchiq_device *device); - void (*remove)(struct vchiq_device *device); - int (*resume)(struct vchiq_device *device); - int (*suspend)(struct vchiq_device *device, - pm_message_t state); - - const struct vchiq_device_id *id_table; - struct device_driver driver; -}; - -static inline struct vchiq_device *to_vchiq_device(struct device *d) -{ - return container_of(d, struct vchiq_device, dev); -} - -static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d) -{ - return container_of(d, struct vchiq_driver, driver); -} - -extern const struct bus_type vchiq_bus_type; - -struct vchiq_device * -vchiq_device_register(struct device *parent, const char *name); -void vchiq_device_unregister(struct vchiq_device *dev); - -int vchiq_driver_register(struct vchiq_driver *vchiq_drv); -void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv); - -/** - * module_vchiq_driver() - Helper macro for registering a vchiq driver - * @__vchiq_driver: vchiq driver struct - * - * Helper macro for vchiq drivers which do not do anything special in - * module init/exit. This eliminates a lot of boilerplate. Each module may only - * use this macro once, and calling it replaces module_init() and module_exit() - */ -#define module_vchiq_driver(__vchiq_driver) \ - module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister) - -#endif /* _VCHIQ_DEVICE_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h deleted file mode 100644 index a16d0299996c..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_cfg.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2010-2014 Broadcom. All rights reserved. */ - -#ifndef VCHIQ_CFG_H -#define VCHIQ_CFG_H - -#define VCHIQ_MAGIC VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I') -/* The version of VCHIQ - change with any non-trivial change */ -#define VCHIQ_VERSION 8 -/* - * The minimum compatible version - update to match VCHIQ_VERSION with any - * incompatible change - */ -#define VCHIQ_VERSION_MIN 3 - -/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */ -#define VCHIQ_VERSION_LIB_VERSION 7 - -/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */ -#define VCHIQ_VERSION_CLOSE_DELIVERED 7 - -/* The version that made it safe to use SYNCHRONOUS mode */ -#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8 - -#define VCHIQ_MAX_STATES 1 -#define VCHIQ_MAX_SERVICES 4096 -#define VCHIQ_MAX_SLOTS 128 -#define VCHIQ_MAX_SLOTS_PER_SIDE 64 - -#define VCHIQ_NUM_CURRENT_BULKS 32 -#define VCHIQ_NUM_SERVICE_BULKS 4 - -#ifndef VCHIQ_ENABLE_DEBUG -#define VCHIQ_ENABLE_DEBUG 1 -#endif - -#ifndef VCHIQ_ENABLE_STATS -#define VCHIQ_ENABLE_STATS 1 -#endif - -#endif /* VCHIQ_CFG_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c index 130be2f58342..83de27cfd469 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c @@ -15,8 +15,8 @@ #include #include -#include "vchiq_arm.h" -#include "vchiq_core.h" +#include +#include #define VCHIQ_SLOT_HANDLER_STACK 8192 diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h deleted file mode 100644 index e3ed50d26c37..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.h +++ /dev/null @@ -1,646 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ - -#ifndef VCHIQ_CORE_H -#define VCHIQ_CORE_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../../include/linux/raspberrypi/vchiq.h" -#include "vchiq_cfg.h" - -/* Do this so that we can test-build the code on non-rpi systems */ -#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) - -#else - -#ifndef dsb -#define dsb(a) -#endif - -#endif /* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */ - -#define VCHIQ_SERVICE_HANDLE_INVALID 0 - -#define VCHIQ_SLOT_SIZE 4096 -#define VCHIQ_MAX_MSG_SIZE (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header)) - -#define VCHIQ_SLOT_MASK (VCHIQ_SLOT_SIZE - 1) -#define VCHIQ_SLOT_QUEUE_MASK (VCHIQ_MAX_SLOTS_PER_SIDE - 1) -#define VCHIQ_SLOT_ZERO_SLOTS DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \ - VCHIQ_SLOT_SIZE) - -#define BITSET_SIZE(b) ((b + 31) >> 5) -#define BITSET_WORD(b) (b >> 5) -#define BITSET_BIT(b) (1 << (b & 31)) -#define BITSET_IS_SET(bs, b) (bs[BITSET_WORD(b)] & BITSET_BIT(b)) -#define BITSET_SET(bs, b) (bs[BITSET_WORD(b)] |= BITSET_BIT(b)) - -enum { - DEBUG_ENTRIES, -#if VCHIQ_ENABLE_DEBUG - DEBUG_SLOT_HANDLER_COUNT, - DEBUG_SLOT_HANDLER_LINE, - DEBUG_PARSE_LINE, - DEBUG_PARSE_HEADER, - DEBUG_PARSE_MSGID, - DEBUG_AWAIT_COMPLETION_LINE, - DEBUG_DEQUEUE_MESSAGE_LINE, - DEBUG_SERVICE_CALLBACK_LINE, - DEBUG_MSG_QUEUE_FULL_COUNT, - DEBUG_COMPLETION_QUEUE_FULL_COUNT, -#endif - DEBUG_MAX -}; - -#if VCHIQ_ENABLE_DEBUG - -#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug -#define DEBUG_TRACE(d) \ - do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0) -#define DEBUG_VALUE(d, v) \ - do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0) -#define DEBUG_COUNT(d) \ - do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0) - -#else /* VCHIQ_ENABLE_DEBUG */ - -#define DEBUG_INITIALISE(local) -#define DEBUG_TRACE(d) -#define DEBUG_VALUE(d, v) -#define DEBUG_COUNT(d) - -#endif /* VCHIQ_ENABLE_DEBUG */ - -enum vchiq_connstate { - VCHIQ_CONNSTATE_DISCONNECTED, - VCHIQ_CONNSTATE_CONNECTING, - VCHIQ_CONNSTATE_CONNECTED, - VCHIQ_CONNSTATE_PAUSING, - VCHIQ_CONNSTATE_PAUSE_SENT, - VCHIQ_CONNSTATE_PAUSED, - VCHIQ_CONNSTATE_RESUMING, - VCHIQ_CONNSTATE_PAUSE_TIMEOUT, - VCHIQ_CONNSTATE_RESUME_TIMEOUT -}; - -enum { - VCHIQ_SRVSTATE_FREE, - VCHIQ_SRVSTATE_HIDDEN, - VCHIQ_SRVSTATE_LISTENING, - VCHIQ_SRVSTATE_OPENING, - VCHIQ_SRVSTATE_OPEN, - VCHIQ_SRVSTATE_OPENSYNC, - VCHIQ_SRVSTATE_CLOSESENT, - VCHIQ_SRVSTATE_CLOSERECVD, - VCHIQ_SRVSTATE_CLOSEWAIT, - VCHIQ_SRVSTATE_CLOSED -}; - -enum vchiq_bulk_dir { - VCHIQ_BULK_TRANSMIT, - VCHIQ_BULK_RECEIVE -}; - -struct vchiq_bulk { - short mode; - short dir; - void *cb_data; - void __user *cb_userdata; - struct bulk_waiter *waiter; - dma_addr_t dma_addr; - int size; - void *remote_data; - int remote_size; - int actual; - void *offset; - void __user *uoffset; -}; - -struct vchiq_bulk_queue { - int local_insert; /* Where to insert the next local bulk */ - int remote_insert; /* Where to insert the next remote bulk (master) */ - int process; /* Bulk to transfer next */ - int remote_notify; /* Bulk to notify the remote client of next (mstr) */ - int remove; /* Bulk to notify the local client of, and remove, next */ - struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS]; -}; - -/* - * Remote events provide a way of presenting several virtual doorbells to a - * peer (ARM host to VPU) using only one physical doorbell. They can be thought - * of as a way for the peer to signal a semaphore, in this case implemented as - * a workqueue. - * - * Remote events remain signalled until acknowledged by the receiver, and they - * are non-counting. They are designed in such a way as to minimise the number - * of interrupts and avoid unnecessary waiting. - * - * A remote_event is as small data structures that live in shared memory. It - * comprises two booleans - armed and fired: - * - * The sender sets fired when they signal the receiver. - * If fired is set, the receiver has been signalled and need not wait. - * The receiver sets the armed field before they begin to wait. - * If armed is set, the receiver is waiting and wishes to be woken by interrupt. - */ -struct remote_event { - int armed; - int fired; - u32 __unused; -}; - -struct opaque_platform_state; - -struct vchiq_slot { - char data[VCHIQ_SLOT_SIZE]; -}; - -struct vchiq_slot_info { - /* Use two counters rather than one to avoid the need for a mutex. */ - short use_count; - short release_count; -}; - -/* - * VCHIQ is a reliable connection-oriented datagram protocol. - * - * A VCHIQ service is equivalent to a TCP connection, except: - * + FOURCCs are used for the rendezvous, and port numbers are assigned at the - * time the connection is established. - * + There is less of a distinction between server and client sockets, the only - * difference being which end makes the first move. - * + For a multi-client server, the server creates new "listening" services as - * the existing one becomes connected - there is no need to specify the - * maximum number of clients up front. - * + Data transfer is reliable but packetized (messages have defined ends). - * + Messages can be either short (capable of fitting in a slot) and in-band, - * or copied between external buffers (bulk transfers). - */ -struct vchiq_service { - struct vchiq_service_base base; - unsigned int handle; - struct kref ref_count; - struct rcu_head rcu; - int srvstate; - void (*userdata_term)(void *userdata); - unsigned int localport; - unsigned int remoteport; - int public_fourcc; - int client_id; - char auto_close; - char sync; - char closing; - char trace; - atomic_t poll_flags; - short version; - short version_min; - short peer_version; - - struct vchiq_state *state; - struct vchiq_instance *instance; - - int service_use_count; - - struct vchiq_bulk_queue bulk_tx; - struct vchiq_bulk_queue bulk_rx; - - struct completion remove_event; - struct completion bulk_remove_event; - struct mutex bulk_mutex; - - struct service_stats_struct { - int quota_stalls; - int slot_stalls; - int bulk_stalls; - int error_count; - int ctrl_tx_count; - int ctrl_rx_count; - int bulk_tx_count; - int bulk_rx_count; - int bulk_aborted_count; - u64 ctrl_tx_bytes; - u64 ctrl_rx_bytes; - u64 bulk_tx_bytes; - u64 bulk_rx_bytes; - } stats; - - int msg_queue_read; - int msg_queue_write; - struct completion msg_queue_pop; - struct completion msg_queue_push; - struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS]; -}; - -/* - * The quota information is outside struct vchiq_service so that it can - * be statically allocated, since for accounting reasons a service's slot - * usage is carried over between users of the same port number. - */ -struct vchiq_service_quota { - unsigned short slot_quota; - unsigned short slot_use_count; - unsigned short message_quota; - unsigned short message_use_count; - struct completion quota_event; - int previous_tx_index; -}; - -struct vchiq_shared_state { - /* A non-zero value here indicates that the content is valid. */ - int initialised; - - /* The first and last (inclusive) slots allocated to the owner. */ - int slot_first; - int slot_last; - - /* The slot allocated to synchronous messages from the owner. */ - int slot_sync; - - /* - * Signalling this event indicates that owner's slot handler thread - * should run. - */ - struct remote_event trigger; - - /* - * Indicates the byte position within the stream where the next message - * will be written. The least significant bits are an index into the - * slot. The next bits are the index of the slot in slot_queue. - */ - int tx_pos; - - /* This event should be signalled when a slot is recycled. */ - struct remote_event recycle; - - /* The slot_queue index where the next recycled slot will be written. */ - int slot_queue_recycle; - - /* This event should be signalled when a synchronous message is sent. */ - struct remote_event sync_trigger; - - /* - * This event should be signalled when a synchronous message has been - * released. - */ - struct remote_event sync_release; - - /* A circular buffer of slot indexes. */ - int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE]; - - /* Debugging state */ - int debug[DEBUG_MAX]; -}; - -/* - * vchiq_slot_zero describes the memory shared between the ARM host and the - * VideoCore VPU. The "master" and "slave" states are owned by the respective - * sides but visible to the other; the slots are shared, and the remaining - * fields are read-only. - * - * In the configuration used by this implementation, the memory is allocated - * by the host, the VPU is the master (the side which controls the DMA for bulk - * transfers), and the host is the slave. - * - * The ownership of slots changes with use: - * + When empty they are owned by the sender. - * + When partially filled they are shared with the receiver. - * + When completely full they are owned by the receiver. - * + When the receiver has finished processing the contents, they are recycled - * back to the sender. - */ -struct vchiq_slot_zero { - int magic; - short version; - short version_min; - int slot_zero_size; - int slot_size; - int max_slots; - int max_slots_per_side; - int platform_data[2]; - struct vchiq_shared_state master; - struct vchiq_shared_state slave; - struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS]; -}; - -/* - * This is the private runtime state used by each side. The same structure was - * originally used by both sides, but implementations have since diverged. - */ -struct vchiq_state { - struct device *dev; - int id; - int initialised; - enum vchiq_connstate conn_state; - short version_common; - - struct vchiq_shared_state *local; - struct vchiq_shared_state *remote; - struct vchiq_slot *slot_data; - - unsigned short default_slot_quota; - unsigned short default_message_quota; - - /* Event indicating connect message received */ - struct completion connect; - - /* Mutex protecting services */ - struct mutex mutex; - struct vchiq_instance **instance; - - /* Processes all incoming messages which aren't synchronous */ - struct task_struct *slot_handler_thread; - - /* - * Slots which have been fully processed and released by the (peer) - * receiver are added to the receiver queue, which is asynchronously - * processed by the recycle thread. - */ - struct task_struct *recycle_thread; - - /* - * Processes incoming synchronous messages - * - * The synchronous message channel is shared between all synchronous - * services, and provides a way for urgent messages to bypass - * potentially long queues of asynchronous messages in the normal slots. - * - * There can be only one outstanding synchronous message in - * each direction, and as a precious shared resource synchronous - * services should be used sparingly. - */ - struct task_struct *sync_thread; - - /* Local implementation of the trigger remote event */ - wait_queue_head_t trigger_event; - - /* Local implementation of the recycle remote event */ - wait_queue_head_t recycle_event; - - /* Local implementation of the sync trigger remote event */ - wait_queue_head_t sync_trigger_event; - - /* Local implementation of the sync release remote event */ - wait_queue_head_t sync_release_event; - - char *tx_data; - char *rx_data; - struct vchiq_slot_info *rx_info; - - struct mutex slot_mutex; - - struct mutex recycle_mutex; - - struct mutex sync_mutex; - - spinlock_t msg_queue_spinlock; - - spinlock_t bulk_waiter_spinlock; - - spinlock_t quota_spinlock; - - /* - * Indicates the byte position within the stream from where the next - * message will be read. The least significant bits are an index into - * the slot.The next bits are the index of the slot in - * remote->slot_queue. - */ - int rx_pos; - - /* - * A cached copy of local->tx_pos. Only write to local->tx_pos, and read - * from remote->tx_pos. - */ - int local_tx_pos; - - /* The slot_queue index of the slot to become available next. */ - int slot_queue_available; - - /* A flag to indicate if any poll has been requested */ - int poll_needed; - - /* Ths index of the previous slot used for data messages. */ - int previous_data_index; - - /* The number of slots occupied by data messages. */ - unsigned short data_use_count; - - /* The maximum number of slots to be occupied by data messages. */ - unsigned short data_quota; - - /* An array of bit sets indicating which services must be polled. */ - atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; - - /* The number of the first unused service */ - int unused_service; - - /* Signalled when a free slot becomes available. */ - struct completion slot_available_event; - - /* Signalled when a free data slot becomes available. */ - struct completion data_quota_event; - - struct state_stats_struct { - int slot_stalls; - int data_stalls; - int ctrl_tx_count; - int ctrl_rx_count; - int error_count; - } stats; - - struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES]; - struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES]; - struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS]; - - struct opaque_platform_state *platform_state; -}; - -struct pagelist { - u32 length; - u16 type; - u16 offset; - u32 addrs[1]; /* N.B. 12 LSBs hold the number - * of following pages at consecutive - * addresses. - */ -}; - -struct vchiq_pagelist_info { - struct pagelist *pagelist; - size_t pagelist_buffer_size; - dma_addr_t dma_addr; - enum dma_data_direction dma_dir; - unsigned int num_pages; - unsigned int pages_need_release; - struct page **pages; - struct scatterlist *scatterlist; - unsigned int scatterlist_mapped; -}; - -static inline bool vchiq_remote_initialised(const struct vchiq_state *state) -{ - return state->remote && state->remote->initialised; -} - -struct bulk_waiter { - struct vchiq_bulk *bulk; - struct completion event; - int actual; -}; - -struct vchiq_config { - unsigned int max_msg_size; - unsigned int bulk_threshold; /* The message size above which it - * is better to use a bulk transfer - * (<= max_msg_size) - */ - unsigned int max_outstanding_bulks; - unsigned int max_services; - short version; /* The version of VCHIQ */ - short version_min; /* The minimum compatible version of VCHIQ */ -}; - -extern spinlock_t bulk_waiter_spinlock; - -extern const char * -get_conn_state_name(enum vchiq_connstate conn_state); - -extern struct vchiq_slot_zero * -vchiq_init_slots(struct device *dev, void *mem_base, int mem_size); - -extern int -vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev); - -extern int -vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance); - -struct vchiq_service * -vchiq_add_service_internal(struct vchiq_state *state, - const struct vchiq_service_params_kernel *params, - int srvstate, struct vchiq_instance *instance, - void (*userdata_term)(void *userdata)); - -extern int -vchiq_open_service_internal(struct vchiq_service *service, int client_id); - -extern int -vchiq_close_service_internal(struct vchiq_service *service, int close_recvd); - -extern void -vchiq_terminate_service_internal(struct vchiq_service *service); - -extern void -vchiq_free_service_internal(struct vchiq_service *service); - -extern void -vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance); - -extern void -remote_event_pollall(struct vchiq_state *state); - -extern int -vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle, - struct bulk_waiter *userdata); - -extern int -vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle, - struct vchiq_bulk *bulk); - -extern int -vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle, - struct vchiq_bulk *bulk); - -extern void -vchiq_dump_state(struct seq_file *f, struct vchiq_state *state); - -extern void -request_poll(struct vchiq_state *state, struct vchiq_service *service, - int poll_type); - -struct vchiq_service *handle_to_service(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -find_service_by_handle(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -find_service_by_port(struct vchiq_state *state, unsigned int localport); - -extern struct vchiq_service * -find_service_for_instance(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle); - -extern struct vchiq_service * -__next_service_by_instance(struct vchiq_state *state, - struct vchiq_instance *instance, - int *pidx); - -extern struct vchiq_service * -next_service_by_instance(struct vchiq_state *state, - struct vchiq_instance *instance, - int *pidx); - -extern void -vchiq_service_get(struct vchiq_service *service); - -extern void -vchiq_service_put(struct vchiq_service *service); - -extern int -vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle, - ssize_t (*copy_callback)(void *context, void *dest, - size_t offset, size_t maxsize), - void *context, - size_t size); - -void vchiq_dump_platform_state(struct seq_file *f); - -void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f); - -void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service); - -int vchiq_use_service_internal(struct vchiq_service *service); - -int vchiq_release_service_internal(struct vchiq_service *service); - -void vchiq_on_remote_use(struct vchiq_state *state); - -void vchiq_on_remote_release(struct vchiq_state *state); - -int vchiq_platform_init_state(struct vchiq_state *state); - -int vchiq_check_service(struct vchiq_service *service); - -int vchiq_send_remote_use(struct vchiq_state *state); - -int vchiq_send_remote_use_active(struct vchiq_state *state); - -void vchiq_platform_conn_state_changed(struct vchiq_state *state, - enum vchiq_connstate oldstate, - enum vchiq_connstate newstate); - -void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate); - -void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr, - const void *void_mem, size_t num_bytes); - -int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service); - -int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service); - -void vchiq_get_config(struct vchiq_config *config); - -int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service, - enum vchiq_service_option option, int value); - -#endif diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c index d5f7f61c5626..c82326a9b6d9 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.c @@ -5,9 +5,9 @@ */ #include -#include "vchiq_core.h" -#include "vchiq_arm.h" -#include "vchiq_debugfs.h" +#include +#include +#include #ifdef CONFIG_DEBUG_FS diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h deleted file mode 100644 index b29e6693c949..000000000000 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_debugfs.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. */ - -#ifndef VCHIQ_DEBUGFS_H -#define VCHIQ_DEBUGFS_H - -struct vchiq_state; -struct vchiq_instance; - -struct vchiq_debugfs_node { - struct dentry *dentry; -}; - -void vchiq_debugfs_init(struct vchiq_state *state); - -void vchiq_debugfs_deinit(void); - -void vchiq_debugfs_add_instance(struct vchiq_instance *instance); - -void vchiq_debugfs_remove_instance(struct vchiq_instance *instance); - -#endif /* VCHIQ_DEBUGFS_H */ diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c index 3b20ba5c7362..0f3dde2657d6 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_dev.c @@ -11,10 +11,11 @@ #include #include -#include "vchiq_core.h" +#include +#include +#include + #include "vchiq_ioctl.h" -#include "vchiq_arm.h" -#include "vchiq_debugfs.h" static const char *const ioctl_names[] = { "CONNECT", diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h index afb71a83cfe7..d0c759f6d8ea 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_ioctl.h @@ -5,8 +5,7 @@ #define VCHIQ_IOCTLS_H #include - -#include "../../include/linux/raspberrypi/vchiq.h" +#include #define VCHIQ_IOC_MAGIC 0xc4 #define VCHIQ_INVALID_HANDLE (~0) diff --git a/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c b/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c index c2b5a37915f2..cd073ed3ea2d 100644 --- a/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c +++ b/drivers/staging/vc04_services/vchiq-mmal/mmal-vchiq.c @@ -22,11 +22,12 @@ #include #include #include +#include #include #include -#include "../include/linux/raspberrypi/vchiq.h" -#include "../interface/vchiq_arm/vchiq_arm.h" +#include + #include "mmal-common.h" #include "mmal-vchiq.h" #include "mmal-msg.h" diff --git a/include/linux/raspberrypi/vchiq.h b/include/linux/raspberrypi/vchiq.h new file mode 100644 index 000000000000..ee4469f4fc51 --- /dev/null +++ b/include/linux/raspberrypi/vchiq.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ + +#ifndef VCHIQ_H +#define VCHIQ_H + +#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \ + (((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3)) + +enum vchiq_reason { + VCHIQ_SERVICE_OPENED, /* service, -, - */ + VCHIQ_SERVICE_CLOSED, /* service, -, - */ + VCHIQ_MESSAGE_AVAILABLE, /* service, header, - */ + VCHIQ_BULK_TRANSMIT_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_TRANSMIT_ABORTED, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_ABORTED /* service, -, bulk_userdata */ +}; + +enum vchiq_bulk_mode { + VCHIQ_BULK_MODE_CALLBACK, + VCHIQ_BULK_MODE_BLOCKING, + VCHIQ_BULK_MODE_NOCALLBACK, + VCHIQ_BULK_MODE_WAITING /* Reserved for internal use */ +}; + +enum vchiq_service_option { + VCHIQ_SERVICE_OPTION_AUTOCLOSE, + VCHIQ_SERVICE_OPTION_SLOT_QUOTA, + VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA, + VCHIQ_SERVICE_OPTION_SYNCHRONOUS, + VCHIQ_SERVICE_OPTION_TRACE +}; + +struct vchiq_header { + /* The message identifier - opaque to applications. */ + int msgid; + + /* Size of message data. */ + unsigned int size; + + char data[]; /* message */ +}; + +struct vchiq_element { + const void __user *data; + unsigned int size; +}; + +struct vchiq_instance; +struct vchiq_state; + +struct vchiq_service_base { + int fourcc; + int (*callback)(struct vchiq_instance *instance, + enum vchiq_reason reason, + struct vchiq_header *header, + unsigned int handle, + void *cb_data, void __user *cb_userdata); + void *userdata; +}; + +struct vchiq_completion_data_kernel { + enum vchiq_reason reason; + struct vchiq_header *header; + void *service_userdata; + void *cb_data; + void __user *cb_userdata; +}; + +struct vchiq_service_params_kernel { + int fourcc; + int (*callback)(struct vchiq_instance *instance, + enum vchiq_reason reason, + struct vchiq_header *header, + unsigned int handle, + void *cb_data, void __user *cb_userdata); + void *userdata; + short version; /* Increment for non-trivial changes */ + short version_min; /* Update for incompatible changes */ +}; + +extern int vchiq_initialise(struct vchiq_state *state, + struct vchiq_instance **pinstance); +extern int vchiq_shutdown(struct vchiq_instance *instance); +extern int vchiq_connect(struct vchiq_instance *instance); +extern int vchiq_open_service(struct vchiq_instance *instance, + const struct vchiq_service_params_kernel *params, + unsigned int *pservice); +extern int vchiq_close_service(struct vchiq_instance *instance, + unsigned int service); +extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service); +extern int vchiq_release_service(struct vchiq_instance *instance, + unsigned int service); +extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_header *header); +extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service, + struct vchiq_header *header); +extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle, + void *data, unsigned int size); +extern int vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service, + const void *data, unsigned int size, void *userdata, + enum vchiq_bulk_mode mode); +extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service, + void *data, unsigned int size, void *userdata, + enum vchiq_bulk_mode mode); +extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service); +extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle, + short *peer_version); +extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle); + +#endif /* VCHIQ_H */ diff --git a/include/linux/raspberrypi/vchiq_arm.h b/include/linux/raspberrypi/vchiq_arm.h new file mode 100644 index 000000000000..e32b02f99024 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_arm.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + */ + +#ifndef VCHIQ_ARM_H +#define VCHIQ_ARM_H + +#include +#include +#include +#include +#include "vchiq_core.h" +#include "vchiq_debugfs.h" + +/* Some per-instance constants */ +#define MAX_COMPLETIONS 128 +#define MAX_SERVICES 64 +#define MAX_ELEMENTS 8 +#define MSG_QUEUE_SIZE 128 + +#define VCHIQ_DRV_MAX_CALLBACKS 10 + +struct rpi_firmware; +struct vchiq_device; + +enum USE_TYPE_E { + USE_TYPE_SERVICE, + USE_TYPE_VCHIQ +}; + +struct vchiq_platform_info { + unsigned int cache_line_size; +}; + +struct vchiq_drv_mgmt { + struct rpi_firmware *fw; + const struct vchiq_platform_info *info; + + bool connected; + int num_deferred_callbacks; + /* Protects connected and num_deferred_callbacks */ + struct mutex connected_mutex; + + void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void); + + struct semaphore free_fragments_sema; + struct semaphore free_fragments_mutex; + char *fragments_base; + char *free_fragments; + unsigned int fragments_size; + + void __iomem *regs; + + struct vchiq_state state; +}; + +struct user_service { + struct vchiq_service *service; + void __user *userdata; + struct vchiq_instance *instance; + char is_vchi; + char dequeue_pending; + char close_pending; + int message_available_pos; + int msg_insert; + int msg_remove; + struct completion insert_event; + struct completion remove_event; + struct completion close_event; + struct vchiq_header *msg_queue[MSG_QUEUE_SIZE]; +}; + +struct bulk_waiter_node { + struct bulk_waiter bulk_waiter; + int pid; + struct list_head list; +}; + +struct vchiq_instance { + struct vchiq_state *state; + struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS]; + int completion_insert; + int completion_remove; + struct completion insert_event; + struct completion remove_event; + struct mutex completion_mutex; + + int connected; + int closing; + int pid; + int mark; + int use_close_delivered; + int trace; + + struct list_head bulk_waiter_list; + struct mutex bulk_waiter_list_mutex; + + struct vchiq_debugfs_node debugfs_node; +}; + +int +vchiq_use_service(struct vchiq_instance *instance, unsigned int handle); + +extern int +vchiq_release_service(struct vchiq_instance *instance, unsigned int handle); + +extern int +vchiq_check_service(struct vchiq_service *service); + +extern void +vchiq_dump_service_use_state(struct vchiq_state *state); + +extern int +vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service, + enum USE_TYPE_E use_type); +extern int +vchiq_release_internal(struct vchiq_state *state, + struct vchiq_service *service); + +extern struct vchiq_debugfs_node * +vchiq_instance_get_debugfs_node(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_use_count(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_pid(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_trace(struct vchiq_instance *instance); + +extern void +vchiq_instance_set_trace(struct vchiq_instance *instance, int trace); + +extern void +vchiq_add_connected_callback(struct vchiq_device *device, + void (*callback)(void)); + +#if IS_ENABLED(CONFIG_VCHIQ_CDEV) + +extern void +vchiq_deregister_chrdev(void); + +extern int +vchiq_register_chrdev(struct device *parent); + +#else + +static inline void vchiq_deregister_chrdev(void) { } +static inline int vchiq_register_chrdev(struct device *parent) { return 0; } + +#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */ + +extern int +service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason, + struct vchiq_header *header, unsigned int handle, + void *cb_data, void __user *cb_userdata); + +extern void +free_bulk_waiter(struct vchiq_instance *instance); + +#endif /* VCHIQ_ARM_H */ diff --git a/include/linux/raspberrypi/vchiq_bus.h b/include/linux/raspberrypi/vchiq_bus.h new file mode 100644 index 000000000000..9de179b39f85 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_bus.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Ideas On Board Oy + */ + +#ifndef _VCHIQ_DEVICE_H +#define _VCHIQ_DEVICE_H + +#include +#include + +struct vchiq_drv_mgmt; + +struct vchiq_device { + struct device dev; + struct vchiq_drv_mgmt *drv_mgmt; +}; + +struct vchiq_driver { + int (*probe)(struct vchiq_device *device); + void (*remove)(struct vchiq_device *device); + int (*resume)(struct vchiq_device *device); + int (*suspend)(struct vchiq_device *device, + pm_message_t state); + + const struct vchiq_device_id *id_table; + struct device_driver driver; +}; + +static inline struct vchiq_device *to_vchiq_device(struct device *d) +{ + return container_of(d, struct vchiq_device, dev); +} + +static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d) +{ + return container_of(d, struct vchiq_driver, driver); +} + +extern const struct bus_type vchiq_bus_type; + +struct vchiq_device * +vchiq_device_register(struct device *parent, const char *name); +void vchiq_device_unregister(struct vchiq_device *dev); + +int vchiq_driver_register(struct vchiq_driver *vchiq_drv); +void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv); + +/** + * module_vchiq_driver() - Helper macro for registering a vchiq driver + * @__vchiq_driver: vchiq driver struct + * + * Helper macro for vchiq drivers which do not do anything special in + * module init/exit. This eliminates a lot of boilerplate. Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_vchiq_driver(__vchiq_driver) \ + module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister) + +#endif /* _VCHIQ_DEVICE_H */ diff --git a/include/linux/raspberrypi/vchiq_cfg.h b/include/linux/raspberrypi/vchiq_cfg.h new file mode 100644 index 000000000000..a16d0299996c --- /dev/null +++ b/include/linux/raspberrypi/vchiq_cfg.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2014 Broadcom. All rights reserved. */ + +#ifndef VCHIQ_CFG_H +#define VCHIQ_CFG_H + +#define VCHIQ_MAGIC VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I') +/* The version of VCHIQ - change with any non-trivial change */ +#define VCHIQ_VERSION 8 +/* + * The minimum compatible version - update to match VCHIQ_VERSION with any + * incompatible change + */ +#define VCHIQ_VERSION_MIN 3 + +/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */ +#define VCHIQ_VERSION_LIB_VERSION 7 + +/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */ +#define VCHIQ_VERSION_CLOSE_DELIVERED 7 + +/* The version that made it safe to use SYNCHRONOUS mode */ +#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8 + +#define VCHIQ_MAX_STATES 1 +#define VCHIQ_MAX_SERVICES 4096 +#define VCHIQ_MAX_SLOTS 128 +#define VCHIQ_MAX_SLOTS_PER_SIDE 64 + +#define VCHIQ_NUM_CURRENT_BULKS 32 +#define VCHIQ_NUM_SERVICE_BULKS 4 + +#ifndef VCHIQ_ENABLE_DEBUG +#define VCHIQ_ENABLE_DEBUG 1 +#endif + +#ifndef VCHIQ_ENABLE_STATS +#define VCHIQ_ENABLE_STATS 1 +#endif + +#endif /* VCHIQ_CFG_H */ diff --git a/include/linux/raspberrypi/vchiq_core.h b/include/linux/raspberrypi/vchiq_core.h new file mode 100644 index 000000000000..e7bf7a114985 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_core.h @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2012 Broadcom. All rights reserved. */ + +#ifndef VCHIQ_CORE_H +#define VCHIQ_CORE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vchiq.h" +#include "vchiq_cfg.h" + +/* Do this so that we can test-build the code on non-rpi systems */ +#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) + +#else + +#ifndef dsb +#define dsb(a) +#endif + +#endif /* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */ + +#define VCHIQ_SERVICE_HANDLE_INVALID 0 + +#define VCHIQ_SLOT_SIZE 4096 +#define VCHIQ_MAX_MSG_SIZE (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header)) + +#define VCHIQ_SLOT_MASK (VCHIQ_SLOT_SIZE - 1) +#define VCHIQ_SLOT_QUEUE_MASK (VCHIQ_MAX_SLOTS_PER_SIDE - 1) +#define VCHIQ_SLOT_ZERO_SLOTS DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \ + VCHIQ_SLOT_SIZE) + +#define BITSET_SIZE(b) ((b + 31) >> 5) +#define BITSET_WORD(b) (b >> 5) +#define BITSET_BIT(b) (1 << (b & 31)) +#define BITSET_IS_SET(bs, b) (bs[BITSET_WORD(b)] & BITSET_BIT(b)) +#define BITSET_SET(bs, b) (bs[BITSET_WORD(b)] |= BITSET_BIT(b)) + +enum { + DEBUG_ENTRIES, +#if VCHIQ_ENABLE_DEBUG + DEBUG_SLOT_HANDLER_COUNT, + DEBUG_SLOT_HANDLER_LINE, + DEBUG_PARSE_LINE, + DEBUG_PARSE_HEADER, + DEBUG_PARSE_MSGID, + DEBUG_AWAIT_COMPLETION_LINE, + DEBUG_DEQUEUE_MESSAGE_LINE, + DEBUG_SERVICE_CALLBACK_LINE, + DEBUG_MSG_QUEUE_FULL_COUNT, + DEBUG_COMPLETION_QUEUE_FULL_COUNT, +#endif + DEBUG_MAX +}; + +#if VCHIQ_ENABLE_DEBUG + +#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug +#define DEBUG_TRACE(d) \ + do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0) +#define DEBUG_VALUE(d, v) \ + do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0) +#define DEBUG_COUNT(d) \ + do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0) + +#else /* VCHIQ_ENABLE_DEBUG */ + +#define DEBUG_INITIALISE(local) +#define DEBUG_TRACE(d) +#define DEBUG_VALUE(d, v) +#define DEBUG_COUNT(d) + +#endif /* VCHIQ_ENABLE_DEBUG */ + +enum vchiq_connstate { + VCHIQ_CONNSTATE_DISCONNECTED, + VCHIQ_CONNSTATE_CONNECTING, + VCHIQ_CONNSTATE_CONNECTED, + VCHIQ_CONNSTATE_PAUSING, + VCHIQ_CONNSTATE_PAUSE_SENT, + VCHIQ_CONNSTATE_PAUSED, + VCHIQ_CONNSTATE_RESUMING, + VCHIQ_CONNSTATE_PAUSE_TIMEOUT, + VCHIQ_CONNSTATE_RESUME_TIMEOUT +}; + +enum { + VCHIQ_SRVSTATE_FREE, + VCHIQ_SRVSTATE_HIDDEN, + VCHIQ_SRVSTATE_LISTENING, + VCHIQ_SRVSTATE_OPENING, + VCHIQ_SRVSTATE_OPEN, + VCHIQ_SRVSTATE_OPENSYNC, + VCHIQ_SRVSTATE_CLOSESENT, + VCHIQ_SRVSTATE_CLOSERECVD, + VCHIQ_SRVSTATE_CLOSEWAIT, + VCHIQ_SRVSTATE_CLOSED +}; + +enum vchiq_bulk_dir { + VCHIQ_BULK_TRANSMIT, + VCHIQ_BULK_RECEIVE +}; + +struct vchiq_bulk { + short mode; + short dir; + void *cb_data; + void __user *cb_userdata; + struct bulk_waiter *waiter; + dma_addr_t dma_addr; + int size; + void *remote_data; + int remote_size; + int actual; + void *offset; + void __user *uoffset; +}; + +struct vchiq_bulk_queue { + int local_insert; /* Where to insert the next local bulk */ + int remote_insert; /* Where to insert the next remote bulk (master) */ + int process; /* Bulk to transfer next */ + int remote_notify; /* Bulk to notify the remote client of next (mstr) */ + int remove; /* Bulk to notify the local client of, and remove, next */ + struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS]; +}; + +/* + * Remote events provide a way of presenting several virtual doorbells to a + * peer (ARM host to VPU) using only one physical doorbell. They can be thought + * of as a way for the peer to signal a semaphore, in this case implemented as + * a workqueue. + * + * Remote events remain signalled until acknowledged by the receiver, and they + * are non-counting. They are designed in such a way as to minimise the number + * of interrupts and avoid unnecessary waiting. + * + * A remote_event is as small data structures that live in shared memory. It + * comprises two booleans - armed and fired: + * + * The sender sets fired when they signal the receiver. + * If fired is set, the receiver has been signalled and need not wait. + * The receiver sets the armed field before they begin to wait. + * If armed is set, the receiver is waiting and wishes to be woken by interrupt. + */ +struct remote_event { + int armed; + int fired; + u32 __unused; +}; + +struct opaque_platform_state; + +struct vchiq_slot { + char data[VCHIQ_SLOT_SIZE]; +}; + +struct vchiq_slot_info { + /* Use two counters rather than one to avoid the need for a mutex. */ + short use_count; + short release_count; +}; + +/* + * VCHIQ is a reliable connection-oriented datagram protocol. + * + * A VCHIQ service is equivalent to a TCP connection, except: + * + FOURCCs are used for the rendezvous, and port numbers are assigned at the + * time the connection is established. + * + There is less of a distinction between server and client sockets, the only + * difference being which end makes the first move. + * + For a multi-client server, the server creates new "listening" services as + * the existing one becomes connected - there is no need to specify the + * maximum number of clients up front. + * + Data transfer is reliable but packetized (messages have defined ends). + * + Messages can be either short (capable of fitting in a slot) and in-band, + * or copied between external buffers (bulk transfers). + */ +struct vchiq_service { + struct vchiq_service_base base; + unsigned int handle; + struct kref ref_count; + struct rcu_head rcu; + int srvstate; + void (*userdata_term)(void *userdata); + unsigned int localport; + unsigned int remoteport; + int public_fourcc; + int client_id; + char auto_close; + char sync; + char closing; + char trace; + atomic_t poll_flags; + short version; + short version_min; + short peer_version; + + struct vchiq_state *state; + struct vchiq_instance *instance; + + int service_use_count; + + struct vchiq_bulk_queue bulk_tx; + struct vchiq_bulk_queue bulk_rx; + + struct completion remove_event; + struct completion bulk_remove_event; + struct mutex bulk_mutex; + + struct service_stats_struct { + int quota_stalls; + int slot_stalls; + int bulk_stalls; + int error_count; + int ctrl_tx_count; + int ctrl_rx_count; + int bulk_tx_count; + int bulk_rx_count; + int bulk_aborted_count; + u64 ctrl_tx_bytes; + u64 ctrl_rx_bytes; + u64 bulk_tx_bytes; + u64 bulk_rx_bytes; + } stats; + + int msg_queue_read; + int msg_queue_write; + struct completion msg_queue_pop; + struct completion msg_queue_push; + struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS]; +}; + +/* + * The quota information is outside struct vchiq_service so that it can + * be statically allocated, since for accounting reasons a service's slot + * usage is carried over between users of the same port number. + */ +struct vchiq_service_quota { + unsigned short slot_quota; + unsigned short slot_use_count; + unsigned short message_quota; + unsigned short message_use_count; + struct completion quota_event; + int previous_tx_index; +}; + +struct vchiq_shared_state { + /* A non-zero value here indicates that the content is valid. */ + int initialised; + + /* The first and last (inclusive) slots allocated to the owner. */ + int slot_first; + int slot_last; + + /* The slot allocated to synchronous messages from the owner. */ + int slot_sync; + + /* + * Signalling this event indicates that owner's slot handler thread + * should run. + */ + struct remote_event trigger; + + /* + * Indicates the byte position within the stream where the next message + * will be written. The least significant bits are an index into the + * slot. The next bits are the index of the slot in slot_queue. + */ + int tx_pos; + + /* This event should be signalled when a slot is recycled. */ + struct remote_event recycle; + + /* The slot_queue index where the next recycled slot will be written. */ + int slot_queue_recycle; + + /* This event should be signalled when a synchronous message is sent. */ + struct remote_event sync_trigger; + + /* + * This event should be signalled when a synchronous message has been + * released. + */ + struct remote_event sync_release; + + /* A circular buffer of slot indexes. */ + int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE]; + + /* Debugging state */ + int debug[DEBUG_MAX]; +}; + +/* + * vchiq_slot_zero describes the memory shared between the ARM host and the + * VideoCore VPU. The "master" and "slave" states are owned by the respective + * sides but visible to the other; the slots are shared, and the remaining + * fields are read-only. + * + * In the configuration used by this implementation, the memory is allocated + * by the host, the VPU is the master (the side which controls the DMA for bulk + * transfers), and the host is the slave. + * + * The ownership of slots changes with use: + * + When empty they are owned by the sender. + * + When partially filled they are shared with the receiver. + * + When completely full they are owned by the receiver. + * + When the receiver has finished processing the contents, they are recycled + * back to the sender. + */ +struct vchiq_slot_zero { + int magic; + short version; + short version_min; + int slot_zero_size; + int slot_size; + int max_slots; + int max_slots_per_side; + int platform_data[2]; + struct vchiq_shared_state master; + struct vchiq_shared_state slave; + struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS]; +}; + +/* + * This is the private runtime state used by each side. The same structure was + * originally used by both sides, but implementations have since diverged. + */ +struct vchiq_state { + struct device *dev; + int id; + int initialised; + enum vchiq_connstate conn_state; + short version_common; + + struct vchiq_shared_state *local; + struct vchiq_shared_state *remote; + struct vchiq_slot *slot_data; + + unsigned short default_slot_quota; + unsigned short default_message_quota; + + /* Event indicating connect message received */ + struct completion connect; + + /* Mutex protecting services */ + struct mutex mutex; + struct vchiq_instance **instance; + + /* Processes all incoming messages which aren't synchronous */ + struct task_struct *slot_handler_thread; + + /* + * Slots which have been fully processed and released by the (peer) + * receiver are added to the receiver queue, which is asynchronously + * processed by the recycle thread. + */ + struct task_struct *recycle_thread; + + /* + * Processes incoming synchronous messages + * + * The synchronous message channel is shared between all synchronous + * services, and provides a way for urgent messages to bypass + * potentially long queues of asynchronous messages in the normal slots. + * + * There can be only one outstanding synchronous message in + * each direction, and as a precious shared resource synchronous + * services should be used sparingly. + */ + struct task_struct *sync_thread; + + /* Local implementation of the trigger remote event */ + wait_queue_head_t trigger_event; + + /* Local implementation of the recycle remote event */ + wait_queue_head_t recycle_event; + + /* Local implementation of the sync trigger remote event */ + wait_queue_head_t sync_trigger_event; + + /* Local implementation of the sync release remote event */ + wait_queue_head_t sync_release_event; + + char *tx_data; + char *rx_data; + struct vchiq_slot_info *rx_info; + + struct mutex slot_mutex; + + struct mutex recycle_mutex; + + struct mutex sync_mutex; + + spinlock_t msg_queue_spinlock; + + spinlock_t bulk_waiter_spinlock; + + spinlock_t quota_spinlock; + + /* + * Indicates the byte position within the stream from where the next + * message will be read. The least significant bits are an index into + * the slot.The next bits are the index of the slot in + * remote->slot_queue. + */ + int rx_pos; + + /* + * A cached copy of local->tx_pos. Only write to local->tx_pos, and read + * from remote->tx_pos. + */ + int local_tx_pos; + + /* The slot_queue index of the slot to become available next. */ + int slot_queue_available; + + /* A flag to indicate if any poll has been requested */ + int poll_needed; + + /* Ths index of the previous slot used for data messages. */ + int previous_data_index; + + /* The number of slots occupied by data messages. */ + unsigned short data_use_count; + + /* The maximum number of slots to be occupied by data messages. */ + unsigned short data_quota; + + /* An array of bit sets indicating which services must be polled. */ + atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; + + /* The number of the first unused service */ + int unused_service; + + /* Signalled when a free slot becomes available. */ + struct completion slot_available_event; + + /* Signalled when a free data slot becomes available. */ + struct completion data_quota_event; + + struct state_stats_struct { + int slot_stalls; + int data_stalls; + int ctrl_tx_count; + int ctrl_rx_count; + int error_count; + } stats; + + struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES]; + struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES]; + struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS]; + + struct opaque_platform_state *platform_state; +}; + +struct pagelist { + u32 length; + u16 type; + u16 offset; + u32 addrs[1]; /* N.B. 12 LSBs hold the number + * of following pages at consecutive + * addresses. + */ +}; + +struct vchiq_pagelist_info { + struct pagelist *pagelist; + size_t pagelist_buffer_size; + dma_addr_t dma_addr; + enum dma_data_direction dma_dir; + unsigned int num_pages; + unsigned int pages_need_release; + struct page **pages; + struct scatterlist *scatterlist; + unsigned int scatterlist_mapped; +}; + +static inline bool vchiq_remote_initialised(const struct vchiq_state *state) +{ + return state->remote && state->remote->initialised; +} + +struct bulk_waiter { + struct vchiq_bulk *bulk; + struct completion event; + int actual; +}; + +struct vchiq_config { + unsigned int max_msg_size; + unsigned int bulk_threshold; /* The message size above which it + * is better to use a bulk transfer + * (<= max_msg_size) + */ + unsigned int max_outstanding_bulks; + unsigned int max_services; + short version; /* The version of VCHIQ */ + short version_min; /* The minimum compatible version of VCHIQ */ +}; + +extern spinlock_t bulk_waiter_spinlock; + +extern const char * +get_conn_state_name(enum vchiq_connstate conn_state); + +extern struct vchiq_slot_zero * +vchiq_init_slots(struct device *dev, void *mem_base, int mem_size); + +extern int +vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev); + +extern int +vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance); + +struct vchiq_service * +vchiq_add_service_internal(struct vchiq_state *state, + const struct vchiq_service_params_kernel *params, + int srvstate, struct vchiq_instance *instance, + void (*userdata_term)(void *userdata)); + +extern int +vchiq_open_service_internal(struct vchiq_service *service, int client_id); + +extern int +vchiq_close_service_internal(struct vchiq_service *service, int close_recvd); + +extern void +vchiq_terminate_service_internal(struct vchiq_service *service); + +extern void +vchiq_free_service_internal(struct vchiq_service *service); + +extern void +vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance); + +extern void +remote_event_pollall(struct vchiq_state *state); + +extern int +vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle, + struct bulk_waiter *userdata); + +extern int +vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_bulk *bulk); + +extern int +vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_bulk *bulk); + +extern void +vchiq_dump_state(struct seq_file *f, struct vchiq_state *state); + +extern void +request_poll(struct vchiq_state *state, struct vchiq_service *service, + int poll_type); + +struct vchiq_service *handle_to_service(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_service_by_handle(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_service_by_port(struct vchiq_state *state, unsigned int localport); + +extern struct vchiq_service * +find_service_for_instance(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +__next_service_by_instance(struct vchiq_state *state, + struct vchiq_instance *instance, + int *pidx); + +extern struct vchiq_service * +next_service_by_instance(struct vchiq_state *state, + struct vchiq_instance *instance, + int *pidx); + +extern void +vchiq_service_get(struct vchiq_service *service); + +extern void +vchiq_service_put(struct vchiq_service *service); + +extern int +vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle, + ssize_t (*copy_callback)(void *context, void *dest, + size_t offset, size_t maxsize), + void *context, + size_t size); + +void vchiq_dump_platform_state(struct seq_file *f); + +void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f); + +void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service); + +int vchiq_use_service_internal(struct vchiq_service *service); + +int vchiq_release_service_internal(struct vchiq_service *service); + +void vchiq_on_remote_use(struct vchiq_state *state); + +void vchiq_on_remote_release(struct vchiq_state *state); + +int vchiq_platform_init_state(struct vchiq_state *state); + +int vchiq_check_service(struct vchiq_service *service); + +int vchiq_send_remote_use(struct vchiq_state *state); + +int vchiq_send_remote_use_active(struct vchiq_state *state); + +void vchiq_platform_conn_state_changed(struct vchiq_state *state, + enum vchiq_connstate oldstate, + enum vchiq_connstate newstate); + +void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate); + +void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr, + const void *void_mem, size_t num_bytes); + +int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service); + +int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service); + +void vchiq_get_config(struct vchiq_config *config); + +int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service, + enum vchiq_service_option option, int value); + +#endif diff --git a/include/linux/raspberrypi/vchiq_debugfs.h b/include/linux/raspberrypi/vchiq_debugfs.h new file mode 100644 index 000000000000..b29e6693c949 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_debugfs.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. */ + +#ifndef VCHIQ_DEBUGFS_H +#define VCHIQ_DEBUGFS_H + +struct vchiq_state; +struct vchiq_instance; + +struct vchiq_debugfs_node { + struct dentry *dentry; +}; + +void vchiq_debugfs_init(struct vchiq_state *state); + +void vchiq_debugfs_deinit(void); + +void vchiq_debugfs_add_instance(struct vchiq_instance *instance); + +void vchiq_debugfs_remove_instance(struct vchiq_instance *instance); + +#endif /* VCHIQ_DEBUGFS_H */ -- cgit v1.2.3 From 1d165919c8261b927f8dc8cfe61eb04342bedb7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Oct 2025 19:47:59 -0700 Subject: iio: imu: adis: fix all kernel-doc warnings in header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct and add to adis.h to resolve all kernel-doc warnings: - add a missing struct member description - change one non-kernel-doc comment to use /* instead of /** - correct function parameter @value to @val (7 locations) - add function return value comments (13 locations) Warning: include/linux/iio/imu/adis.h:97 struct member 'has_fifo' not described in 'adis_data' Warning: include/linux/iio/imu/adis.h:139 Incorrect use of kernel-doc format: * The state_lock is meant to be used during operations that require Warning: include/linux/iio/imu/adis.h:158 struct member '"__adis_"' not described in 'adis' Warning: include/linux/iio/imu/adis.h:264 function parameter 'val' not described in 'adis_write_reg' Warning: include/linux/iio/imu/adis.h:371 No description found for return value of 'adis_update_bits_base' Signed-off-by: Randy Dunlap Reviewed-by: Nuno Sá Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 45 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index aa160511e265..bfb6df68e6c9 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -57,6 +57,7 @@ struct adis_timeout { * @enable_irq: Hook for ADIS devices that have a special IRQ enable/disable * @unmasked_drdy: True for devices that cannot mask/unmask the data ready pin * @has_paging: True if ADIS device has paged registers + * @has_fifo: True if ADIS device has a hardware FIFO * @burst_reg_cmd: Register command that triggers burst * @burst_len: Burst size in the SPI RX buffer. If @burst_max_len is defined, * this should be the minimum size supported by the device. @@ -136,7 +137,7 @@ struct adis { const struct adis_data *data; unsigned int burst_extra_len; const struct adis_ops *ops; - /** + /* * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer * information (fields 'xfer', 'msg' & 'current_page') between @@ -166,7 +167,7 @@ int __adis_reset(struct adis *adis); * adis_reset() - Reset the device * @adis: The adis device * - * Returns 0 on success, a negative error code otherwise + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_reset(struct adis *adis) { @@ -183,7 +184,9 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, * __adis_write_reg_8() - Write single byte to a register (unlocked) * @adis: The adis device * @reg: The address of the register to be written - * @value: The value to write + * @val: The value to write + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) @@ -195,7 +198,9 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, * __adis_write_reg_16() - Write 2 bytes to a pair of registers (unlocked) * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) @@ -207,7 +212,9 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, * __adis_write_reg_32() - write 4 bytes to four registers (unlocked) * @adis: The adis device * @reg: The address of the lower of the four register - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) @@ -220,6 +227,8 @@ static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, u16 *val) @@ -239,6 +248,8 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, u32 *val) @@ -257,8 +268,10 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, * adis_write_reg() - write N bytes to register * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: The value to write to device (up to 4 bytes) + * @val: The value to write to device (up to 4 bytes) * @size: The size of the @value (in bytes) + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) @@ -273,6 +286,8 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, * @reg: The address of the lower of the two registers * @val: The value read back from the device * @size: The size of the @val buffer + * + * Returns: %0 on success, a negative error code otherwise */ static int adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) @@ -285,7 +300,9 @@ static int adis_read_reg(struct adis *adis, unsigned int reg, * adis_write_reg_8() - Write single byte to a register * @adis: The adis device * @reg: The address of the register to be written - * @value: The value to write + * @val: The value to write + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) @@ -297,7 +314,9 @@ static inline int adis_write_reg_8(struct adis *adis, unsigned int reg, * adis_write_reg_16() - Write 2 bytes to a pair of registers * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) @@ -309,7 +328,9 @@ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg, * adis_write_reg_32() - write 4 bytes to four registers * @adis: The adis device * @reg: The address of the lower of the four register - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) @@ -322,6 +343,8 @@ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg, u16 *val) @@ -341,6 +364,8 @@ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_read_reg_32(struct adis *adis, unsigned int reg, u32 *val) @@ -366,6 +391,8 @@ int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, * @size: Size of the register to update * * Updates the desired bits of @reg in accordance with @mask and @val. + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, const u32 val, u8 size) -- cgit v1.2.3 From aaa5abcc9d44d2c8484f779ab46d242d774cabcb Mon Sep 17 00:00:00 2001 From: Carl Worth Date: Thu, 25 Sep 2025 18:42:31 +0800 Subject: coresight: tmc: add the handle of the event to the path The handle is essential for retrieving the AUX_EVENT of each CPU and is required in perf mode. It has been added to the coresight_path so that dependent devices can access it from the path when needed. The existing bug can be reproduced with: perf record -e cs_etm//k -C 0-9 dd if=/dev/zero of=/dev/null Showing an oops as follows: Unable to handle kernel paging request at virtual address 000f6e84934ed19e Call trace: tmc_etr_get_buffer+0x30/0x80 [coresight_tmc] (P) catu_enable_hw+0xbc/0x3d0 [coresight_catu] catu_enable+0x70/0xe0 [coresight_catu] coresight_enable_path+0xb0/0x258 [coresight] Fixes: 080ee83cc361 ("Coresight: Change functions to accept the coresight_path") Signed-off-by: Carl Worth Reviewed-by: Leo Yan Co-developed-by: Jie Gan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-1-edd8a07c1646@oss.qualcomm.com --- drivers/hwtracing/coresight/coresight-etm-perf.c | 1 + drivers/hwtracing/coresight/coresight-tmc-etr.c | 3 ++- include/linux/coresight.h | 10 ++++++---- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index f677c08233ba..5c256af6e54a 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -520,6 +520,7 @@ static void etm_event_start(struct perf_event *event, int flags) goto out; path = etm_event_cpu_path(event_data, cpu); + path->handle = handle; /* We need a sink, no need to continue without one */ sink = coresight_get_sink(path); if (WARN_ON_ONCE(!sink)) diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 800be06598c1..60b0e0a6da05 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -1334,7 +1334,8 @@ out: struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev, enum cs_mode mode, void *data) { - struct perf_output_handle *handle = data; + struct coresight_path *path = data; + struct perf_output_handle *handle = path->handle; struct etr_perf_buffer *etr_perf; switch (mode) { diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 6de59ce8ef8c..2626105e3719 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -332,12 +332,14 @@ static struct coresight_dev_list (var) = { \ /** * struct coresight_path - data needed by enable/disable path - * @path_list: path from source to sink. - * @trace_id: trace_id of the whole path. + * @path_list: path from source to sink. + * @trace_id: trace_id of the whole path. + * @handle: handle of the aux_event. */ struct coresight_path { - struct list_head path_list; - u8 trace_id; + struct list_head path_list; + u8 trace_id; + struct perf_output_handle *handle; }; enum cs_mode { -- cgit v1.2.3 From 94baedb51dea4b0c97e3c9acd90953bec98d03e7 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 25 Sep 2025 18:42:32 +0800 Subject: coresight: change helper_ops to accept coresight_path Update the helper_enable and helper_disable functions to accept coresight_path instead of a generic void *data, as coresight_path encapsulates all the necessary data required by devices along the path. Tested-by: Carl Worth Reviewed-by: Carl Worth Reviewed-by: Leo Yan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-2-edd8a07c1646@oss.qualcomm.com --- drivers/hwtracing/coresight/coresight-catu.c | 10 +++++----- drivers/hwtracing/coresight/coresight-core.c | 20 ++++++++++++-------- drivers/hwtracing/coresight/coresight-ctcu-core.c | 9 +++------ drivers/hwtracing/coresight/coresight-cti-core.c | 5 +++-- drivers/hwtracing/coresight/coresight-cti.h | 5 +++-- drivers/hwtracing/coresight/coresight-tmc-etr.c | 4 ++-- drivers/hwtracing/coresight/coresight-tmc.h | 3 ++- include/linux/coresight.h | 5 +++-- 8 files changed, 33 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index a3ccb7034ae1..69b36bae97ab 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -397,7 +397,7 @@ static int catu_wait_for_ready(struct catu_drvdata *drvdata) } static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode, - void *data) + struct coresight_path *path) { int rc; u32 control, mode; @@ -425,7 +425,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode, etrdev = coresight_find_input_type( csdev->pdata, CORESIGHT_DEV_TYPE_SINK, etr_subtype); if (etrdev) { - etr_buf = tmc_etr_get_buffer(etrdev, cs_mode, data); + etr_buf = tmc_etr_get_buffer(etrdev, cs_mode, path); if (IS_ERR(etr_buf)) return PTR_ERR(etr_buf); } @@ -455,7 +455,7 @@ static int catu_enable_hw(struct catu_drvdata *drvdata, enum cs_mode cs_mode, } static int catu_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { int rc = 0; struct catu_drvdata *catu_drvdata = csdev_to_catu_drvdata(csdev); @@ -463,7 +463,7 @@ static int catu_enable(struct coresight_device *csdev, enum cs_mode mode, guard(raw_spinlock_irqsave)(&catu_drvdata->spinlock); if (csdev->refcnt == 0) { CS_UNLOCK(catu_drvdata->base); - rc = catu_enable_hw(catu_drvdata, mode, data); + rc = catu_enable_hw(catu_drvdata, mode, path); CS_LOCK(catu_drvdata->base); } if (!rc) @@ -488,7 +488,7 @@ static int catu_disable_hw(struct catu_drvdata *drvdata) return rc; } -static int catu_disable(struct coresight_device *csdev, void *__unused) +static int catu_disable(struct coresight_device *csdev, struct coresight_path *path) { int rc = 0; struct catu_drvdata *catu_drvdata = csdev_to_catu_drvdata(csdev); diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index 3267192f0c1c..f44ec9e5b692 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -355,17 +355,20 @@ static bool coresight_is_helper(struct coresight_device *csdev) } static int coresight_enable_helper(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { - return helper_ops(csdev)->enable(csdev, mode, data); + return helper_ops(csdev)->enable(csdev, mode, path); } -static void coresight_disable_helper(struct coresight_device *csdev, void *data) +static void coresight_disable_helper(struct coresight_device *csdev, + struct coresight_path *path) { - helper_ops(csdev)->disable(csdev, data); + helper_ops(csdev)->disable(csdev, path); } -static void coresight_disable_helpers(struct coresight_device *csdev, void *data) +static void coresight_disable_helpers(struct coresight_device *csdev, + struct coresight_path *path) { int i; struct coresight_device *helper; @@ -373,7 +376,7 @@ static void coresight_disable_helpers(struct coresight_device *csdev, void *data for (i = 0; i < csdev->pdata->nr_outconns; ++i) { helper = csdev->pdata->out_conns[i]->dest_dev; if (helper && coresight_is_helper(helper)) - coresight_disable_helper(helper, data); + coresight_disable_helper(helper, path); } } @@ -479,7 +482,8 @@ void coresight_disable_path(struct coresight_path *path) EXPORT_SYMBOL_GPL(coresight_disable_path); static int coresight_enable_helpers(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { int i, ret = 0; struct coresight_device *helper; @@ -489,7 +493,7 @@ static int coresight_enable_helpers(struct coresight_device *csdev, if (!helper || !coresight_is_helper(helper)) continue; - ret = coresight_enable_helper(helper, mode, data); + ret = coresight_enable_helper(helper, mode, path); if (ret) return ret; } diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index c586495e9a08..abed15eb72b4 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -156,17 +156,14 @@ static int ctcu_set_etr_traceid(struct coresight_device *csdev, struct coresight return __ctcu_set_etr_traceid(csdev, traceid, port_num, enable); } -static int ctcu_enable(struct coresight_device *csdev, enum cs_mode mode, void *data) +static int ctcu_enable(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_path *path) { - struct coresight_path *path = (struct coresight_path *)data; - return ctcu_set_etr_traceid(csdev, path, true); } -static int ctcu_disable(struct coresight_device *csdev, void *data) +static int ctcu_disable(struct coresight_device *csdev, struct coresight_path *path) { - struct coresight_path *path = (struct coresight_path *)data; - return ctcu_set_etr_traceid(csdev, path, false); } diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c index 8fb30dd73fd2..bfbc365bb2ef 100644 --- a/drivers/hwtracing/coresight/coresight-cti-core.c +++ b/drivers/hwtracing/coresight/coresight-cti-core.c @@ -799,14 +799,15 @@ static void cti_pm_release(struct cti_drvdata *drvdata) } /** cti ect operations **/ -int cti_enable(struct coresight_device *csdev, enum cs_mode mode, void *data) +int cti_enable(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_path *path) { struct cti_drvdata *drvdata = csdev_to_cti_drvdata(csdev); return cti_enable_hw(drvdata); } -int cti_disable(struct coresight_device *csdev, void *data) +int cti_disable(struct coresight_device *csdev, struct coresight_path *path) { struct cti_drvdata *drvdata = csdev_to_cti_drvdata(csdev); diff --git a/drivers/hwtracing/coresight/coresight-cti.h b/drivers/hwtracing/coresight/coresight-cti.h index 8362a47c939c..4f89091ee93f 100644 --- a/drivers/hwtracing/coresight/coresight-cti.h +++ b/drivers/hwtracing/coresight/coresight-cti.h @@ -216,8 +216,9 @@ int cti_add_connection_entry(struct device *dev, struct cti_drvdata *drvdata, const char *assoc_dev_name); struct cti_trig_con *cti_allocate_trig_con(struct device *dev, int in_sigs, int out_sigs); -int cti_enable(struct coresight_device *csdev, enum cs_mode mode, void *data); -int cti_disable(struct coresight_device *csdev, void *data); +int cti_enable(struct coresight_device *csdev, enum cs_mode mode, + struct coresight_path *path); +int cti_disable(struct coresight_device *csdev, struct coresight_path *path); void cti_write_all_hw_regs(struct cti_drvdata *drvdata); void cti_write_intack(struct device *dev, u32 ackval); void cti_write_single_reg(struct cti_drvdata *drvdata, int offset, u32 value); diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 60b0e0a6da05..51c6f73dd15c 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -1332,9 +1332,9 @@ out: } struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { - struct coresight_path *path = data; struct perf_output_handle *handle = path->handle; struct etr_perf_buffer *etr_perf; diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h index cbb4ba439158..95473d131032 100644 --- a/drivers/hwtracing/coresight/coresight-tmc.h +++ b/drivers/hwtracing/coresight/coresight-tmc.h @@ -442,7 +442,8 @@ struct coresight_device *tmc_etr_get_catu_device(struct tmc_drvdata *drvdata); void tmc_etr_set_catu_ops(const struct etr_buf_operations *catu); void tmc_etr_remove_catu_ops(void); struct etr_buf *tmc_etr_get_buffer(struct coresight_device *csdev, - enum cs_mode mode, void *data); + enum cs_mode mode, + struct coresight_path *path); extern const struct attribute_group coresight_etr_group; #endif diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2626105e3719..2bee2e3bb1c6 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -424,8 +424,9 @@ struct coresight_ops_source { */ struct coresight_ops_helper { int (*enable)(struct coresight_device *csdev, enum cs_mode mode, - void *data); - int (*disable)(struct coresight_device *csdev, void *data); + struct coresight_path *path); + int (*disable)(struct coresight_device *csdev, + struct coresight_path *path); }; -- cgit v1.2.3 From b139702a889692ec30702534ebb1ae2b11ed1cbf Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 25 Sep 2025 18:42:33 +0800 Subject: coresight: change the sink_ops to accept coresight_path Update the sink_enable functions to accept coresight_path instead of a generic void *data, as coresight_path encapsulates all the necessary data required by devices along the path. Tested-by: Carl Worth Reviewed-by: Carl Worth Reviewed-by: Leo Yan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-3-edd8a07c1646@oss.qualcomm.com --- drivers/hwtracing/coresight/coresight-core.c | 10 +++++----- drivers/hwtracing/coresight/coresight-dummy.c | 2 +- drivers/hwtracing/coresight/coresight-etb10.c | 8 ++++---- drivers/hwtracing/coresight/coresight-etm-perf.c | 2 +- drivers/hwtracing/coresight/coresight-priv.h | 3 +-- drivers/hwtracing/coresight/coresight-sysfs.c | 2 +- drivers/hwtracing/coresight/coresight-tmc-etf.c | 10 ++++++---- drivers/hwtracing/coresight/coresight-tmc-etr.c | 10 ++++++---- drivers/hwtracing/coresight/coresight-tpiu.c | 2 +- drivers/hwtracing/coresight/coresight-trbe.c | 4 ++-- drivers/hwtracing/coresight/ultrasoc-smb.c | 9 +++++---- include/linux/coresight.h | 2 +- 12 files changed, 34 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index f44ec9e5b692..c660cf8adb1c 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -300,9 +300,10 @@ unlock: EXPORT_SYMBOL_GPL(coresight_add_helper); static int coresight_enable_sink(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { - return sink_ops(csdev)->enable(csdev, mode, data); + return sink_ops(csdev)->enable(csdev, mode, path); } static void coresight_disable_sink(struct coresight_device *csdev) @@ -501,8 +502,7 @@ static int coresight_enable_helpers(struct coresight_device *csdev, return 0; } -int coresight_enable_path(struct coresight_path *path, enum cs_mode mode, - void *sink_data) +int coresight_enable_path(struct coresight_path *path, enum cs_mode mode) { int ret = 0; u32 type; @@ -532,7 +532,7 @@ int coresight_enable_path(struct coresight_path *path, enum cs_mode mode, switch (type) { case CORESIGHT_DEV_TYPE_SINK: - ret = coresight_enable_sink(csdev, mode, sink_data); + ret = coresight_enable_sink(csdev, mode, path); /* * Sink is the first component turned on. If we * failed to enable the sink, there are no components diff --git a/drivers/hwtracing/coresight/coresight-dummy.c b/drivers/hwtracing/coresight/coresight-dummy.c index aaa92b5081e3..14322c99e29d 100644 --- a/drivers/hwtracing/coresight/coresight-dummy.c +++ b/drivers/hwtracing/coresight/coresight-dummy.c @@ -52,7 +52,7 @@ static int dummy_source_trace_id(struct coresight_device *csdev, __maybe_unused } static int dummy_sink_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { dev_dbg(csdev->dev.parent, "Dummy sink enabled\n"); diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c index 35db1b6093d1..6657602d8f2e 100644 --- a/drivers/hwtracing/coresight/coresight-etb10.c +++ b/drivers/hwtracing/coresight/coresight-etb10.c @@ -167,13 +167,13 @@ out: return ret; } -static int etb_enable_perf(struct coresight_device *csdev, void *data) +static int etb_enable_perf(struct coresight_device *csdev, struct coresight_path *path) { int ret = 0; pid_t pid; unsigned long flags; struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct cs_buffers *buf = etm_perf_sink_config(handle); raw_spin_lock_irqsave(&drvdata->spinlock, flags); @@ -224,7 +224,7 @@ out: } static int etb_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { int ret; @@ -233,7 +233,7 @@ static int etb_enable(struct coresight_device *csdev, enum cs_mode mode, ret = etb_enable_sysfs(csdev); break; case CS_MODE_PERF: - ret = etb_enable_perf(csdev, data); + ret = etb_enable_perf(csdev, path); break; default: ret = -EINVAL; diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 5c256af6e54a..17afa0f4cdee 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -527,7 +527,7 @@ static void etm_event_start(struct perf_event *event, int flags) goto fail_end_stop; /* Nothing will happen without a path */ - if (coresight_enable_path(path, CS_MODE_PERF, handle)) + if (coresight_enable_path(path, CS_MODE_PERF)) goto fail_end_stop; /* Finally enable the tracer */ diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 33e22b1ba043..fd896ac07942 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -135,8 +135,7 @@ static inline void CS_UNLOCK(void __iomem *addr) } void coresight_disable_path(struct coresight_path *path); -int coresight_enable_path(struct coresight_path *path, enum cs_mode mode, - void *sink_data); +int coresight_enable_path(struct coresight_path *path, enum cs_mode mode); struct coresight_device *coresight_get_sink(struct coresight_path *path); struct coresight_device *coresight_get_sink_by_id(u32 id); struct coresight_device * diff --git a/drivers/hwtracing/coresight/coresight-sysfs.c b/drivers/hwtracing/coresight/coresight-sysfs.c index 5e52324aa9ac..d2a6ed8bcc74 100644 --- a/drivers/hwtracing/coresight/coresight-sysfs.c +++ b/drivers/hwtracing/coresight/coresight-sysfs.c @@ -215,7 +215,7 @@ int coresight_enable_sysfs(struct coresight_device *csdev) if (!IS_VALID_CS_TRACE_ID(path->trace_id)) goto err_path; - ret = coresight_enable_path(path, CS_MODE_SYSFS, NULL); + ret = coresight_enable_path(path, CS_MODE_SYSFS); if (ret) goto err_path; diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c index 0f45ab5e5249..8882b1c4cdc0 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etf.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c @@ -246,13 +246,14 @@ out: return ret; } -static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data) +static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, + struct coresight_path *path) { int ret = 0; pid_t pid; unsigned long flags; struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct cs_buffers *buf = etm_perf_sink_config(handle); raw_spin_lock_irqsave(&drvdata->spinlock, flags); @@ -304,7 +305,8 @@ static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, void *data) } static int tmc_enable_etf_sink(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { int ret; @@ -313,7 +315,7 @@ static int tmc_enable_etf_sink(struct coresight_device *csdev, ret = tmc_enable_etf_sink_sysfs(csdev); break; case CS_MODE_PERF: - ret = tmc_enable_etf_sink_perf(csdev, data); + ret = tmc_enable_etf_sink_perf(csdev, path); break; /* We shouldn't be here */ default: diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c index 51c6f73dd15c..e0d83ee01b77 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etr.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c @@ -1733,13 +1733,14 @@ out: return size; } -static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data) +static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, + struct coresight_path *path) { int rc = 0; pid_t pid; unsigned long flags; struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct etr_perf_buffer *etr_perf = etm_perf_sink_config(handle); raw_spin_lock_irqsave(&drvdata->spinlock, flags); @@ -1787,13 +1788,14 @@ unlock_out: } static int tmc_enable_etr_sink(struct coresight_device *csdev, - enum cs_mode mode, void *data) + enum cs_mode mode, + struct coresight_path *path) { switch (mode) { case CS_MODE_SYSFS: return tmc_enable_etr_sink_sysfs(csdev); case CS_MODE_PERF: - return tmc_enable_etr_sink_perf(csdev, data); + return tmc_enable_etr_sink_perf(csdev, path); default: return -EINVAL; } diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 9463afdbda8a..aaa44bc521c3 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -75,7 +75,7 @@ static void tpiu_enable_hw(struct csdev_access *csa) } static int tpiu_enable(struct coresight_device *csdev, enum cs_mode mode, - void *__unused) + struct coresight_path *path) { struct tpiu_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 43643d2c5bdd..293715b4ff0e 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -1013,11 +1013,11 @@ err: } static int arm_trbe_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct trbe_buf *buf = etm_perf_sink_config(handle); WARN_ON(cpudata->cpu != smp_processor_id()); diff --git a/drivers/hwtracing/coresight/ultrasoc-smb.c b/drivers/hwtracing/coresight/ultrasoc-smb.c index 26cfc939e5bd..8f7922a5e534 100644 --- a/drivers/hwtracing/coresight/ultrasoc-smb.c +++ b/drivers/hwtracing/coresight/ultrasoc-smb.c @@ -213,10 +213,11 @@ static void smb_enable_sysfs(struct coresight_device *csdev) coresight_set_mode(csdev, CS_MODE_SYSFS); } -static int smb_enable_perf(struct coresight_device *csdev, void *data) +static int smb_enable_perf(struct coresight_device *csdev, + struct coresight_path *path) { struct smb_drv_data *drvdata = dev_get_drvdata(csdev->dev.parent); - struct perf_output_handle *handle = data; + struct perf_output_handle *handle = path->handle; struct cs_buffers *buf = etm_perf_sink_config(handle); pid_t pid; @@ -240,7 +241,7 @@ static int smb_enable_perf(struct coresight_device *csdev, void *data) } static int smb_enable(struct coresight_device *csdev, enum cs_mode mode, - void *data) + struct coresight_path *path) { struct smb_drv_data *drvdata = dev_get_drvdata(csdev->dev.parent); int ret = 0; @@ -261,7 +262,7 @@ static int smb_enable(struct coresight_device *csdev, enum cs_mode mode, smb_enable_sysfs(csdev); break; case CS_MODE_PERF: - ret = smb_enable_perf(csdev, data); + ret = smb_enable_perf(csdev, path); break; default: ret = -EINVAL; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2bee2e3bb1c6..56d0108658db 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -367,7 +367,7 @@ enum cs_mode { */ struct coresight_ops_sink { int (*enable)(struct coresight_device *csdev, enum cs_mode mode, - void *data); + struct coresight_path *path); int (*disable)(struct coresight_device *csdev); void *(*alloc_buffer)(struct coresight_device *csdev, struct perf_event *event, void **pages, -- cgit v1.2.3 From 22ea7b9d96e26147b7a3ea1be7aa106cc700907c Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:11 +0100 Subject: platform/x86: asus-wmi: export symbols used for read/write WMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export symbols for reading/writing WMI symbols using a namespace. Existing functions: - asus_wmi_evaluate_method - asus_wmi_set_devstate New function: - asus_wmi_get_devstate_dsts The new function is intended for use with DSTS WMI method only and avoids requiring the asus_wmi driver data to select the WMI method. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20251102215319.3126879-2-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 46 ++++++++++++++++++++++++++++-- include/linux/platform_data/x86/asus-wmi.h | 5 ++++ 2 files changed, 48 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index e72a2b5d158e..c3e90517ce0f 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -390,7 +390,7 @@ int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) { return asus_wmi_evaluate_method3(method_id, arg0, arg1, 0, retval); } -EXPORT_SYMBOL_GPL(asus_wmi_evaluate_method); +EXPORT_SYMBOL_NS_GPL(asus_wmi_evaluate_method, "ASUS_WMI"); static int asus_wmi_evaluate_method5(u32 method_id, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 *retval) @@ -554,12 +554,52 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval) return 0; } -int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, - u32 *retval) +/** + * asus_wmi_get_devstate_dsts() - Get the WMI function state. + * @dev_id: The WMI method ID to call. + * @retval: A pointer to where to store the value returned from WMI. + * + * Returns: + * * %-ENODEV - method ID is unsupported. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. + */ +int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) +{ + int err; + + err = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, retval); + if (err) + return err; + + if ((*retval & ASUS_WMI_DSTS_PRESENCE_BIT) == 0x00) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(asus_wmi_get_devstate_dsts, "ASUS_WMI"); + +/** + * asus_wmi_set_devstate() - Set the WMI function state. + * + * Note: an asus_wmi_set_devstate() call must be paired with a + * asus_wmi_get_devstate_dsts() to check if the WMI function is supported. + * + * @dev_id: The WMI function to call. + * @ctrl_param: The argument to be used for this WMI function. + * @retval: A pointer to where to store the value returned from WMI. + * + * Returns: + * * %-ENODEV - method ID is unsupported. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. + */ +int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) { return asus_wmi_evaluate_method(ASUS_WMI_METHODID_DEVS, dev_id, ctrl_param, retval); } +EXPORT_SYMBOL_NS_GPL(asus_wmi_set_devstate, "ASUS_WMI"); /* Helper for special devices with magic return codes */ static int asus_wmi_get_devstate_bits(struct asus_wmi *asus, diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 8a515179113d..dbd44d9fbb6f 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -166,6 +166,7 @@ enum asus_ally_mcu_hack { #if IS_REACHABLE(CONFIG_ASUS_WMI) void set_ally_mcu_hack(enum asus_ally_mcu_hack status); void set_ally_mcu_powersave(bool enabled); +int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval); int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval); int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval); #else @@ -179,6 +180,10 @@ static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) { return -ENODEV; } +static inline int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) +{ + return -ENODEV; +} static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) { -- cgit v1.2.3 From 4f739ed19d222de33b19ca639a34523fbbec20d0 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 14 Oct 2025 07:51:56 +0200 Subject: rv: Pass va_list to reactors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only thing the reactors can do with the passed in varargs is to convert it into a va_list. Do that in a central helper instead. It simplifies the reactors, removes some hairy macro-generated code and introduces a convenient hook point to modify reactor behavior. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-1-0b9e51919ea8@linutronix.de Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 11 +++++++++-- include/rv/da_monitor.h | 35 ++++++++++------------------------- include/rv/ltl_monitor.h | 18 +++++------------- kernel/trace/rv/reactor_panic.c | 6 +----- kernel/trace/rv/reactor_printk.c | 6 +----- kernel/trace/rv/rv_reactors.c | 16 +++++++++++++++- 6 files changed, 41 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 9520aab34bcb..b567b0191e67 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -88,7 +88,7 @@ union rv_task_monitor { struct rv_reactor { const char *name; const char *description; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); struct list_head list; }; #endif @@ -102,7 +102,7 @@ struct rv_monitor { void (*reset)(void); #ifdef CONFIG_RV_REACTORS struct rv_reactor *reactor; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); #endif struct list_head list; struct rv_monitor *parent; @@ -119,11 +119,18 @@ void rv_put_task_monitor_slot(int slot); bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); +__printf(2, 3) +void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else static inline bool rv_reacting_on(void) { return false; } + +__printf(2, 3) +static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) +{ +} #endif /* CONFIG_RV_REACTORS */ #endif /* CONFIG_RV */ diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index 17fa4f6e5ea6..0cef64366538 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -16,34 +16,19 @@ #include #include -#ifdef CONFIG_RV_REACTORS - -#define DECLARE_RV_REACTING_HELPERS(name, type) \ -static void cond_react_##name(type curr_state, type event) \ -{ \ - if (!rv_reacting_on() || !rv_##name.react) \ - return; \ - rv_##name.react("rv: monitor %s does not allow event %s on state %s\n", \ - #name, \ - model_get_event_name_##name(event), \ - model_get_state_name_##name(curr_state)); \ -} - -#else /* CONFIG_RV_REACTOR */ - -#define DECLARE_RV_REACTING_HELPERS(name, type) \ -static void cond_react_##name(type curr_state, type event) \ -{ \ - return; \ -} -#endif - /* * Generic helpers for all types of deterministic automata monitors. */ #define DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ \ -DECLARE_RV_REACTING_HELPERS(name, type) \ +static void react_##name(type curr_state, type event) \ +{ \ + rv_react(&rv_##name, \ + "rv: monitor %s does not allow event %s on state %s\n", \ + #name, \ + model_get_event_name_##name(event), \ + model_get_state_name_##name(curr_state)); \ +} \ \ /* \ * da_monitor_reset_##name - reset a monitor and setting it to init state \ @@ -126,7 +111,7 @@ da_event_##name(struct da_monitor *da_mon, enum events_##name event) \ for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ next_state = model_get_next_state_##name(curr_state, event); \ if (next_state == INVALID_STATE) { \ - cond_react_##name(curr_state, event); \ + react_##name(curr_state, event); \ trace_error_##name(model_get_state_name_##name(curr_state), \ model_get_event_name_##name(event)); \ return false; \ @@ -165,7 +150,7 @@ static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ next_state = model_get_next_state_##name(curr_state, event); \ if (next_state == INVALID_STATE) { \ - cond_react_##name(curr_state, event); \ + react_##name(curr_state, event); \ trace_error_##name(tsk->pid, \ model_get_state_name_##name(curr_state), \ model_get_event_name_##name(event)); \ diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h index 5368cf5fd623..00c42b36f961 100644 --- a/include/rv/ltl_monitor.h +++ b/include/rv/ltl_monitor.h @@ -16,21 +16,12 @@ #error "Please include $(MODEL_NAME).h generated by rvgen" #endif -#ifdef CONFIG_RV_REACTORS #define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME) -static struct rv_monitor RV_MONITOR_NAME; -static void rv_cond_react(struct task_struct *task) -{ - if (!rv_reacting_on() || !RV_MONITOR_NAME.react) - return; - RV_MONITOR_NAME.react("rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n", - task->comm, task->pid); -} +#ifdef CONFIG_RV_REACTORS +static struct rv_monitor RV_MONITOR_NAME; #else -static void rv_cond_react(struct task_struct *task) -{ -} +extern struct rv_monitor RV_MONITOR_NAME; #endif static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT; @@ -98,7 +89,8 @@ static void ltl_monitor_destroy(void) static void ltl_illegal_state(struct task_struct *task, struct ltl_monitor *mon) { CONCATENATE(trace_error_, MONITOR_NAME)(task); - rv_cond_react(task); + rv_react(&RV_MONITOR_NAME, "rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n", + task->comm, task->pid); } static void ltl_attempt_start(struct task_struct *task, struct ltl_monitor *mon) diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 74c6bcc2c749..76537b8a4343 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,13 +13,9 @@ #include #include -__printf(1, 2) static void rv_panic_reaction(const char *msg, ...) +__printf(1, 0) static void rv_panic_reaction(const char *msg, va_list args) { - va_list args; - - va_start(args, msg); vpanic(msg, args); - va_end(args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 2dae2916c05f..48c934e315b3 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,13 +12,9 @@ #include #include -__printf(1, 2) static void rv_printk_reaction(const char *msg, ...) +__printf(1, 0) static void rv_printk_reaction(const char *msg, va_list args) { - va_list args; - - va_start(args, msg); vprintk_deferred(msg, args); - va_end(args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index d32859fec238..cb1a5968055a 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -438,7 +438,7 @@ int reactor_populate_monitor(struct rv_monitor *mon) /* * Nop reactor register */ -__printf(1, 2) static void rv_nop_reaction(const char *msg, ...) +__printf(1, 0) static void rv_nop_reaction(const char *msg, va_list args) { } @@ -477,3 +477,17 @@ rm_available: out_err: return -ENOMEM; } + +void rv_react(struct rv_monitor *monitor, const char *msg, ...) +{ + va_list args; + + if (!rv_reacting_on() || !monitor->react) + return; + + va_start(args, msg); + + monitor->react(msg, args); + + va_end(args); +} -- cgit v1.2.3 From 68f63cea46d3a410a41d9ab74d338038a22bc2ad Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 14 Oct 2025 07:51:57 +0200 Subject: rv: Make rv_reacting_on() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no external users left. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-2-0b9e51919ea8@linutronix.de Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 6 ------ kernel/trace/rv/rv_reactors.c | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index b567b0191e67..92fd467547e7 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -116,17 +116,11 @@ int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); #ifdef CONFIG_RV_REACTORS -bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); __printf(2, 3) void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else -static inline bool rv_reacting_on(void) -{ - return false; -} - __printf(2, 3) static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) { diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index cb1a5968055a..8c02426bc3bd 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -347,7 +347,7 @@ static bool __read_mostly reacting_on; * * Returns 1 if on, 0 otherwise. */ -bool rv_reacting_on(void) +static bool rv_reacting_on(void) { /* Ensures that concurrent monitors read consistent reacting_on */ smp_rmb(); -- cgit v1.2.3 From cb46a58d77e5b433e9f4538faaa2a73970157e8d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Oct 2025 03:44:04 -0700 Subject: efi/memattr: Convert efi_memattr_init() return type to void The efi_memattr_init() function's return values (0 and -ENOMEM) are never checked by callers. Convert the function to return void since the return status is unused. Signed-off-by: Breno Leitao Acked-by: Ard Biesheuvel Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/memattr.c | 7 +++---- include/linux/efi.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c index c38b1a335590..e727cc5909cb 100644 --- a/drivers/firmware/efi/memattr.c +++ b/drivers/firmware/efi/memattr.c @@ -19,19 +19,19 @@ unsigned long __ro_after_init efi_mem_attr_table = EFI_INVALID_TABLE_ADDR; * Reserve the memory associated with the Memory Attributes configuration * table, if it exists. */ -int __init efi_memattr_init(void) +void __init efi_memattr_init(void) { efi_memory_attributes_table_t *tbl; unsigned long size; if (efi_mem_attr_table == EFI_INVALID_TABLE_ADDR) - return 0; + return; tbl = early_memremap(efi_mem_attr_table, sizeof(*tbl)); if (!tbl) { pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", efi_mem_attr_table); - return -ENOMEM; + return; } if (tbl->version > 2) { @@ -61,7 +61,6 @@ int __init efi_memattr_init(void) unmap: early_memunmap(tbl, sizeof(*tbl)); - return 0; } /* diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..0b9eb3d2ff97 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -772,7 +772,7 @@ extern unsigned long efi_mem_attr_table; */ typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *, bool); -extern int efi_memattr_init(void); +extern void efi_memattr_init(void); extern int efi_memattr_apply_permissions(struct mm_struct *mm, efi_memattr_perm_setter fn); -- cgit v1.2.3 From 693d1eaca940f277af24c74873ef2313816ff444 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 11 Nov 2025 18:58:35 +0000 Subject: coresight: Change device mode to atomic type The device mode is defined as local type. This type cannot promise SMP-safe access. Change to atomic type and impose relax ordering, which ensures the SMP-safe synchronisation and the ordering between the mode setting and relevant operations. Fixes: 22fd532eaa0c ("coresight: etm3x: adding operation mode for etm_enable()") Reviewed-by: Mike Leach Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20251111-arm_coresight_power_management_fix-v6-1-f55553b6c8b3@arm.com --- include/linux/coresight.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 56d0108658db..2b48be97fcd0 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -251,15 +251,11 @@ struct coresight_trace_id_map { * by @coresight_ops. * @access: Device i/o access abstraction for this device. * @dev: The device entity associated to this component. - * @mode: This tracer's mode, i.e sysFS, Perf or disabled. This is - * actually an 'enum cs_mode', but is stored in an atomic type. - * This is always accessed through local_read() and local_set(), - * but wherever it's done from within the Coresight device's lock, - * a non-atomic read would also work. This is the main point of - * synchronisation between code happening inside the sysfs mode's - * coresight_mutex and outside when running in Perf mode. A compare - * and exchange swap is done to atomically claim one mode or the - * other. + * @mode: The device mode, i.e sysFS, Perf or disabled. This is actually + * an 'enum cs_mode' but stored in an atomic type. Access is always + * through atomic APIs, ensuring SMP-safe synchronisation between + * racing from sysFS and Perf mode. A compare-and-exchange + * operation is done to atomically claim one mode or the other. * @refcnt: keep track of what is in use. Only access this outside of the * device's spinlock when the coresight_mutex held and mode == * CS_MODE_SYSFS. Otherwise it must be accessed from inside the @@ -288,7 +284,7 @@ struct coresight_device { const struct coresight_ops *ops; struct csdev_access access; struct device dev; - local_t mode; + atomic_t mode; int refcnt; bool orphan; /* sink specific fields */ @@ -624,13 +620,14 @@ static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) static inline bool coresight_take_mode(struct coresight_device *csdev, enum cs_mode new_mode) { - return local_cmpxchg(&csdev->mode, CS_MODE_DISABLED, new_mode) == - CS_MODE_DISABLED; + int curr = CS_MODE_DISABLED; + + return atomic_try_cmpxchg_acquire(&csdev->mode, &curr, new_mode); } static inline enum cs_mode coresight_get_mode(struct coresight_device *csdev) { - return local_read(&csdev->mode); + return atomic_read_acquire(&csdev->mode); } static inline void coresight_set_mode(struct coresight_device *csdev, @@ -646,7 +643,7 @@ static inline void coresight_set_mode(struct coresight_device *csdev, WARN(new_mode != CS_MODE_DISABLED && current_mode != CS_MODE_DISABLED && current_mode != new_mode, "Device already in use\n"); - local_set(&csdev->mode, new_mode); + atomic_set_release(&csdev->mode, new_mode); } struct coresight_device *coresight_register(struct coresight_desc *desc); -- cgit v1.2.3 From 395b95530343e7f4bdd2870190d985a222997fb6 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:07 +0100 Subject: dcache: export shrink_dentry_list() and add new helper d_dispose_if_unused() Add and export a new helper d_dispose_if_unused() which is simply a wrapper around to_shrink_list(), to add an entry to a dispose list if it's not used anymore. Also export shrink_dentry_list() to kill all dentries in a dispose list. Suggested-by: Miklos Szeredi Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- fs/dcache.c | 18 ++++++++++++------ include/linux/dcache.h | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..bffb1b47a907 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1086,6 +1086,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode) return de; } +void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) +{ + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) + to_shrink_list(dentry, dispose); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_dispose_if_unused); + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. @@ -1096,12 +1105,8 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) - to_shrink_list(dentry, &dispose); - spin_unlock(&dentry->d_lock); - } + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } @@ -1141,6 +1146,7 @@ void shrink_dentry_list(struct list_head *list) shrink_kill(dentry); } } +EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..2bc1339bf6d0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -268,6 +268,8 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); +extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); -- cgit v1.2.3 From f99eb098090e4c8bfca4190b545e20450fee8250 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:12 +0100 Subject: platform/x86: asus-armoury: move existing tunings to asus-armoury module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fw_attributes_class provides a much cleaner interface to all of the attributes introduced to asus-wmi. This patch moves all of these extra attributes over to fw_attributes_class, and shifts the bulk of these definitions to a new kernel module to reduce the clutter of asus-wmi with the intention of deprecating the asus-wmi attributes in future. The work applies only to WMI methods which don't have a clearly defined place within the sysfs and as a result ended up lumped together in /sys/devices/platform/asus-nb-wmi/ with no standard API. Where possible the fw attrs now implement defaults, min, max, scalar, choices, etc. As en example dgpu_disable becomes: /sys/class/firmware-attributes/asus-armoury/attributes/dgpu_disable/ ├── current_value ├── display_name ├── possible_values └── type as do other attributes. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Link: https://patch.msgid.link/20251102215319.3126879-3-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/hid/hid-asus.c | 1 + drivers/platform/x86/Kconfig | 12 + drivers/platform/x86/Makefile | 1 + drivers/platform/x86/asus-armoury.c | 763 +++++++++++++++++++++ drivers/platform/x86/asus-armoury.h | 200 ++++++ drivers/platform/x86/asus-wmi.c | 10 +- .../linux/platform_data/x86/asus-wmi-leds-ids.h | 50 ++ include/linux/platform_data/x86/asus-wmi.h | 44 +- 8 files changed, 1034 insertions(+), 47 deletions(-) create mode 100644 drivers/platform/x86/asus-armoury.c create mode 100644 drivers/platform/x86/asus-armoury.h create mode 100644 include/linux/platform_data/x86/asus-wmi-leds-ids.h (limited to 'include/linux') diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index a444d41e53b6..472bca54642b 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include /* For to_usb_interface for T100 touchpad intf check */ #include diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1e9b84f1098f..ba0806b48bb9 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -264,6 +264,18 @@ config ASUS_WIRELESS If you choose to compile this driver as a module the module will be called asus-wireless. +config ASUS_ARMOURY + tristate "ASUS Armoury driver" + depends on ASUS_WMI + select FW_ATTR_CLASS + help + Say Y here if you have a WMI aware Asus machine and would like to use the + firmware_attributes API to control various settings typically exposed in + the ASUS Armoury Crate application available on Windows. + + To compile this driver as a module, choose M here: the module will + be called asus-armoury. + config ASUS_WMI tristate "ASUS WMI Driver" depends on ACPI_WMI diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index d722e244a4a7..d59a2ed5932c 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o # ASUS obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o obj-$(CONFIG_ASUS_WIRELESS) += asus-wireless.o +obj-$(CONFIG_ASUS_ARMOURY) += asus-armoury.o obj-$(CONFIG_ASUS_WMI) += asus-wmi.o obj-$(CONFIG_ASUS_NB_WMI) += asus-nb-wmi.o obj-$(CONFIG_ASUS_TF103C_DOCK) += asus-tf103c-dock.o diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c new file mode 100644 index 000000000000..81b4972df818 --- /dev/null +++ b/drivers/platform/x86/asus-armoury.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Asus Armoury (WMI) attributes driver. + * + * This driver uses the fw_attributes class to expose various WMI functions + * that are present in many gaming and some non-gaming ASUS laptops. + * + * These typically don't fit anywhere else in the sysfs such as under LED class, + * hwmon or others, and are set in Windows using the ASUS Armoury Crate tool. + * + * Copyright(C) 2024 Luke Jones + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "asus-armoury.h" +#include "firmware_attributes_class.h" + +#define ASUS_NB_WMI_EVENT_GUID "0B3CBB35-E3C2-45ED-91C2-4C5A6D195D1C" + +#define ASUS_MINI_LED_MODE_MASK GENMASK(1, 0) +/* Standard modes for devices with only on/off */ +#define ASUS_MINI_LED_OFF 0x00 +#define ASUS_MINI_LED_ON 0x01 +/* Like "on" but the effect is more vibrant or brighter */ +#define ASUS_MINI_LED_STRONG_MODE 0x02 +/* New modes for devices with 3 mini-led mode types */ +#define ASUS_MINI_LED_2024_WEAK 0x00 +#define ASUS_MINI_LED_2024_STRONG 0x01 +#define ASUS_MINI_LED_2024_OFF 0x02 + +struct asus_armoury_priv { + struct device *fw_attr_dev; + struct kset *fw_attr_kset; + + /* + * Mutex to protect eGPU activation/deactivation + * sequences and dGPU connection status: + * do not allow concurrent changes or changes + * before a reboot if dGPU got disabled. + */ + struct mutex egpu_mutex; + + u32 mini_led_dev_id; + u32 gpu_mux_dev_id; +}; + +static struct asus_armoury_priv asus_armoury = { + .egpu_mutex = __MUTEX_INITIALIZER(asus_armoury.egpu_mutex), +}; + +struct fw_attrs_group { + bool pending_reboot; +}; + +static struct fw_attrs_group fw_attrs = { + .pending_reboot = false, +}; + +struct asus_attr_group { + const struct attribute_group *attr_group; + u32 wmi_devid; +}; + +static void asus_set_reboot_and_signal_event(void) +{ + fw_attrs.pending_reboot = true; + kobject_uevent(&asus_armoury.fw_attr_dev->kobj, KOBJ_CHANGE); +} + +static ssize_t pending_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", fw_attrs.pending_reboot); +} + +static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot); + +static bool asus_bios_requires_reboot(struct kobj_attribute *attr) +{ + return !strcmp(attr->attr.name, "gpu_mux_mode"); +} + +/** + * armoury_has_devstate() - Check presence of the WMI function state. + * + * @dev_id: The WMI method ID to check for presence. + * + * Returns: true iif method is supported. + */ +static bool armoury_has_devstate(u32 dev_id) +{ + u32 retval; + int status; + + status = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, &retval); + pr_debug("%s called (0x%08x), retval: 0x%08x\n", __func__, dev_id, retval); + + return status == 0 && (retval & ASUS_WMI_DSTS_PRESENCE_BIT); +} + +/** + * armoury_get_devstate() - Get the WMI function state. + * @attr: NULL or the kobj_attribute associated to called WMI function. + * @dev_id: The WMI method ID to call. + * @retval: + * * non-NULL pointer to where to store the value returned from WMI + * * with the function presence bit cleared. + * + * Intended usage is from sysfs attribute checking associated WMI function. + * + * Returns: + * * %-ENODEV - method ID is unsupported. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. + */ +static int armoury_get_devstate(struct kobj_attribute *attr, u32 *retval, u32 dev_id) +{ + int err; + + err = asus_wmi_get_devstate_dsts(dev_id, retval); + if (err) { + if (attr) + pr_err("Failed to get %s: %d\n", attr->attr.name, err); + else + pr_err("Failed to get devstate for 0x%x: %d\n", dev_id, err); + + return err; + } + + /* + * asus_wmi_get_devstate_dsts will populate retval with WMI return, but + * the true value is expressed when ASUS_WMI_DSTS_PRESENCE_BIT is clear. + */ + *retval &= ~ASUS_WMI_DSTS_PRESENCE_BIT; + + return 0; +} + +/** + * armoury_set_devstate() - Set the WMI function state. + * @attr: The kobj_attribute associated to called WMI function. + * @dev_id: The WMI method ID to call. + * @value: The new value to be set. + * @retval: Where to store the value returned from WMI or NULL. + * + * Intended usage is from sysfs attribute setting associated WMI function. + * Before calling the presence of the function should be checked. + * + * Every WMI write MUST go through this function to enforce safety checks. + * + * Results !1 is usually considered a fail by ASUS, but some WMI methods + * (like eGPU or CPU cores) do use > 1 to return a status code or similar: + * in these cases caller is interested in the actual return value + * and should perform relevant checks. + * + * Returns: + * * %-EIO - WMI function returned an error. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. + */ +static int armoury_set_devstate(struct kobj_attribute *attr, + u32 value, u32 *retval, u32 dev_id) +{ + u32 result; + int err; + + err = asus_wmi_set_devstate(dev_id, value, retval ? retval : &result); + if (err) { + if (attr) + pr_err("Failed to set %s: %d\n", attr->attr.name, err); + else + pr_err("Failed to set devstate for 0x%x: %d\n", dev_id, err); + + return err; + } + + /* + * If retval == NULL caller is uninterested in return value: + * perform the most common result check here. + */ + if ((retval == NULL) && (result == 0)) { + pr_err("Failed to set %s: (result): 0x%x\n", attr->attr.name, result); + return -EIO; + } + + return 0; +} + +static int armoury_attr_enum_list(char *buf, size_t enum_values) +{ + size_t i; + int len = 0; + + for (i = 0; i < enum_values; i++) { + if (i == 0) + len += sysfs_emit_at(buf, len, "%zu", i); + else + len += sysfs_emit_at(buf, len, ";%zu", i); + } + len += sysfs_emit_at(buf, len, "\n"); + + return len; +} + +ssize_t armoury_attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count, u32 min, u32 max, + u32 *store_value, u32 wmi_dev) +{ + u32 value; + int err; + + err = kstrtou32(buf, 10, &value); + if (err) + return err; + + if (value < min || value > max) + return -EINVAL; + + err = armoury_set_devstate(attr, value, NULL, wmi_dev); + if (err) + return err; + + if (store_value != NULL) + *store_value = value; + sysfs_notify(kobj, NULL, attr->attr.name); + + if (asus_bios_requires_reboot(attr)) + asus_set_reboot_and_signal_event(); + + return count; +} + +ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf, u32 wmi_dev) +{ + u32 result; + int err; + + err = armoury_get_devstate(attr, &result, wmi_dev); + if (err) + return err; + + return sysfs_emit(buf, "%u\n", result); +} + +static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "enumeration\n"); +} + +/* Mini-LED mode **************************************************************/ + +/* Values map for mini-led modes on 2023 and earlier models. */ +static u32 mini_led_mode1_map[] = { + [0] = ASUS_MINI_LED_OFF, + [1] = ASUS_MINI_LED_ON, +}; + +/* Values map for mini-led modes on 2024 and later models. */ +static u32 mini_led_mode2_map[] = { + [0] = ASUS_MINI_LED_2024_OFF, + [1] = ASUS_MINI_LED_2024_WEAK, + [2] = ASUS_MINI_LED_2024_STRONG, +}; + +static ssize_t mini_led_mode_current_value_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + u32 *mini_led_mode_map; + size_t mini_led_mode_map_size; + u32 i, mode; + int err; + + switch (asus_armoury.mini_led_dev_id) { + case ASUS_WMI_DEVID_MINI_LED_MODE: + mini_led_mode_map = mini_led_mode1_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode1_map); + break; + + case ASUS_WMI_DEVID_MINI_LED_MODE2: + mini_led_mode_map = mini_led_mode2_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode2_map); + break; + + default: + pr_err("Unrecognized mini-LED device: %u\n", asus_armoury.mini_led_dev_id); + return -ENODEV; + } + + err = armoury_get_devstate(attr, &mode, asus_armoury.mini_led_dev_id); + if (err) + return err; + + mode = FIELD_GET(ASUS_MINI_LED_MODE_MASK, 0); + + for (i = 0; i < mini_led_mode_map_size; i++) + return sysfs_emit(buf, "%u\n", mini_led_mode_map[i]); + + pr_warn("Unrecognized mini-LED mode: %u", mode); + return -EINVAL; +} + +static ssize_t mini_led_mode_current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + u32 *mini_led_mode_map; + size_t mini_led_mode_map_size; + u32 mode; + int err; + + err = kstrtou32(buf, 10, &mode); + if (err) + return err; + + switch (asus_armoury.mini_led_dev_id) { + case ASUS_WMI_DEVID_MINI_LED_MODE: + mini_led_mode_map = mini_led_mode1_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode1_map); + break; + + case ASUS_WMI_DEVID_MINI_LED_MODE2: + mini_led_mode_map = mini_led_mode2_map; + mini_led_mode_map_size = ARRAY_SIZE(mini_led_mode2_map); + break; + + default: + pr_err("Unrecognized mini-LED devid: %u\n", asus_armoury.mini_led_dev_id); + return -EINVAL; + } + + if (mode >= mini_led_mode_map_size) { + return pr_warn("mini-LED mode unrecognized device: %u\n", mode); + return -ENODEV; + } + + return armoury_attr_uint_store(kobj, attr, buf, count, + 0, mini_led_mode_map[mode], + NULL, asus_armoury.mini_led_dev_id); +} + +static ssize_t mini_led_mode_possible_values_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + switch (asus_armoury.mini_led_dev_id) { + case ASUS_WMI_DEVID_MINI_LED_MODE: + return armoury_attr_enum_list(buf, ARRAY_SIZE(mini_led_mode1_map)); + case ASUS_WMI_DEVID_MINI_LED_MODE2: + return armoury_attr_enum_list(buf, ARRAY_SIZE(mini_led_mode2_map)); + default: + return -ENODEV; + } +} +ASUS_ATTR_GROUP_ENUM(mini_led_mode, "mini_led_mode", "Set the mini-LED backlight mode"); + +static ssize_t gpu_mux_mode_current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int result, err; + bool optimus; + + err = kstrtobool(buf, &optimus); + if (err) + return err; + + if (armoury_has_devstate(ASUS_WMI_DEVID_DGPU)) { + err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_DGPU); + if (err) + return err; + if (result && !optimus) { + pr_warn("Cannot switch MUX to dGPU mode when dGPU is disabled: %02X\n", + result); + return -ENODEV; + } + } + + if (armoury_has_devstate(ASUS_WMI_DEVID_EGPU)) { + err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_EGPU); + if (err) + return err; + if (result && !optimus) { + pr_warn("Cannot switch MUX to dGPU mode when eGPU is enabled\n"); + return -EBUSY; + } + } + + err = armoury_set_devstate(attr, optimus ? 1 : 0, NULL, asus_armoury.gpu_mux_dev_id); + if (err) + return err; + + sysfs_notify(kobj, NULL, attr->attr.name); + asus_set_reboot_and_signal_event(); + + return count; +} +ASUS_WMI_SHOW_INT(gpu_mux_mode_current_value, asus_armoury.gpu_mux_dev_id); +ASUS_ATTR_GROUP_BOOL(gpu_mux_mode, "gpu_mux_mode", "Set the GPU display MUX mode"); + +static ssize_t dgpu_disable_current_value_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + int result, err; + bool disable; + + err = kstrtobool(buf, &disable); + if (err) + return err; + + if (asus_armoury.gpu_mux_dev_id) { + err = armoury_get_devstate(NULL, &result, asus_armoury.gpu_mux_dev_id); + if (err) + return err; + if (!result && disable) { + pr_warn("Cannot disable dGPU when the MUX is in dGPU mode\n"); + return -EBUSY; + } + } + + scoped_guard(mutex, &asus_armoury.egpu_mutex) { + err = armoury_set_devstate(attr, disable ? 1 : 0, NULL, ASUS_WMI_DEVID_DGPU); + if (err) + return err; + } + + sysfs_notify(kobj, NULL, attr->attr.name); + + return count; +} +ASUS_WMI_SHOW_INT(dgpu_disable_current_value, ASUS_WMI_DEVID_DGPU); +ASUS_ATTR_GROUP_BOOL(dgpu_disable, "dgpu_disable", "Disable the dGPU"); + +/* Values map for eGPU activation requests. */ +static u32 egpu_status_map[] = { + [0] = 0x00000000U, + [1] = 0x00000001U, + [2] = 0x00000101U, + [3] = 0x00000201U, +}; + +/* + * armoury_pci_rescan() - Performs a PCI rescan + * + * Bring up any GPU that has been hotplugged in the system. + */ +static void armoury_pci_rescan(void) +{ + struct pci_bus *b = NULL; + + pci_lock_rescan_remove(); + while ((b = pci_find_next_bus(b)) != NULL) + pci_rescan_bus(b); + pci_unlock_rescan_remove(); +} + +/* + * The ACPI call to enable the eGPU might also disable the internal dGPU, + * but this is not always the case and on certain models enabling the eGPU + * when the dGPU is either still active or has been disabled without rebooting + * will make both GPUs malfunction and the kernel will detect many + * PCI AER unrecoverable errors. + */ +static ssize_t egpu_enable_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + u32 requested, enable, result; + + err = kstrtou32(buf, 10, &requested); + if (err) + return err; + + if (requested >= ARRAY_SIZE(egpu_status_map)) + return -EINVAL; + enable = egpu_status_map[requested]; + + scoped_guard(mutex, &asus_armoury.egpu_mutex) { + /* Ensure the eGPU is connected before attempting to activate it. */ + if (enable) { + err = armoury_get_devstate(NULL, &result, ASUS_WMI_DEVID_EGPU_CONNECTED); + if (err) { + pr_warn("Failed to get eGPU connection status: %d\n", err); + return err; + } + if (!result) { + pr_warn("Cannot activate eGPU while undetected\n"); + return -ENOENT; + } + } + + if (asus_armoury.gpu_mux_dev_id) { + err = armoury_get_devstate(NULL, &result, asus_armoury.gpu_mux_dev_id); + if (err) + return err; + + if (!result && enable) { + pr_warn("Cannot enable eGPU when the MUX is in dGPU mode\n"); + return -ENODEV; + } + } + + err = armoury_set_devstate(attr, enable, &result, ASUS_WMI_DEVID_EGPU); + if (err) { + pr_err("Failed to set %s: %d\n", attr->attr.name, err); + return err; + } + + /* + * ACPI returns value 0x01 on success and 0x02 on a partial activation: + * performing a pci rescan will bring up the device in pci-e 3.0 speed, + * after a reboot the device will work at full speed. + */ + switch (result) { + case 0x01: + /* + * When a GPU is in use it does not get disconnected even if + * the ACPI call returns a success. + */ + if (!enable) { + err = armoury_get_devstate(attr, &result, ASUS_WMI_DEVID_EGPU); + if (err) { + pr_warn("Failed to ensure eGPU is deactivated: %d\n", err); + return err; + } + + if (result != 0) + return -EBUSY; + } + + pr_debug("Success changing the eGPU status\n"); + break; + case 0x02: + pr_info("Success changing the eGPU status, a reboot is strongly advised\n"); + asus_set_reboot_and_signal_event(); + break; + default: + pr_err("Failed to change the eGPU status: wmi result is 0x%x\n", result); + return -EIO; + } + } + + /* + * Perform a PCI rescan: on every tested model this is necessary + * to make the eGPU visible on the bus without rebooting. + */ + armoury_pci_rescan(); + + sysfs_notify(kobj, NULL, attr->attr.name); + + return count; +} + +static ssize_t egpu_enable_current_value_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i, err; + u32 status; + + scoped_guard(mutex, &asus_armoury.egpu_mutex) { + err = armoury_get_devstate(attr, &status, ASUS_WMI_DEVID_EGPU); + if (err) + return err; + } + + for (i = 0; i < ARRAY_SIZE(egpu_status_map); i++) { + if (egpu_status_map[i] == status) + return sysfs_emit(buf, "%u\n", i); + } + + return -EIO; +} + +static ssize_t egpu_enable_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return armoury_attr_enum_list(buf, ARRAY_SIZE(egpu_status_map)); +} +ASUS_ATTR_GROUP_ENUM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)"); + +/* Simple attribute creation */ +ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n", + "Show the current mode of charging"); +ASUS_ATTR_GROUP_BOOL_RW(boot_sound, "boot_sound", ASUS_WMI_DEVID_BOOT_SOUND, + "Set the boot POST sound"); +ASUS_ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWERSAVE, + "Set MCU powersaving mode"); +ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, + "Set the panel refresh overdrive"); +ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, + "Show the eGPU connection status"); + +/* If an attribute does not require any special case handling add it here */ +static const struct asus_attr_group armoury_attr_groups[] = { + { &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED }, + { &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU }, + { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, + + { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, + { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, + { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, + { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, +}; + +static int asus_fw_attr_add(void) +{ + int err, i; + + asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), + NULL, "%s", DRIVER_NAME); + if (IS_ERR(asus_armoury.fw_attr_dev)) { + err = PTR_ERR(asus_armoury.fw_attr_dev); + goto fail_class_get; + } + + asus_armoury.fw_attr_kset = kset_create_and_add("attributes", NULL, + &asus_armoury.fw_attr_dev->kobj); + if (!asus_armoury.fw_attr_kset) { + err = -ENOMEM; + goto err_destroy_classdev; + } + + err = sysfs_create_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); + if (err) { + pr_err("Failed to create sysfs level attributes\n"); + goto err_destroy_kset; + } + + asus_armoury.mini_led_dev_id = 0; + if (armoury_has_devstate(ASUS_WMI_DEVID_MINI_LED_MODE)) + asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; + else if (armoury_has_devstate(ASUS_WMI_DEVID_MINI_LED_MODE2)) + asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2; + + if (asus_armoury.mini_led_dev_id) { + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + &mini_led_mode_attr_group); + if (err) { + pr_err("Failed to create sysfs-group for mini_led\n"); + goto err_remove_file; + } + } + + asus_armoury.gpu_mux_dev_id = 0; + if (armoury_has_devstate(ASUS_WMI_DEVID_GPU_MUX)) + asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX; + else if (armoury_has_devstate(ASUS_WMI_DEVID_GPU_MUX_VIVO)) + asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX_VIVO; + + if (asus_armoury.gpu_mux_dev_id) { + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + &gpu_mux_mode_attr_group); + if (err) { + pr_err("Failed to create sysfs-group for gpu_mux\n"); + goto err_remove_mini_led_group; + } + } + + for (i = 0; i < ARRAY_SIZE(armoury_attr_groups); i++) { + if (!armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) + continue; + + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + if (err) { + pr_err("Failed to create sysfs-group for %s\n", + armoury_attr_groups[i].attr_group->name); + goto err_remove_groups; + } + } + + return 0; + +err_remove_groups: + while (i--) { + if (armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + } + if (asus_armoury.gpu_mux_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group); +err_remove_mini_led_group: + if (asus_armoury.mini_led_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group); +err_remove_file: + sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); +err_destroy_kset: + kset_unregister(asus_armoury.fw_attr_kset); +err_destroy_classdev: +fail_class_get: + device_destroy(&firmware_attributes_class, MKDEV(0, 0)); + return err; +} + +/* Init / exit ****************************************************************/ + +static int __init asus_fw_init(void) +{ + char *wmi_uid; + + wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID); + if (!wmi_uid) + return -ENODEV; + + /* + * if equal to "ASUSWMI" then it's DCTS that can't be used for this + * driver, DSTS is required. + */ + if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) + return -ENODEV; + + return asus_fw_attr_add(); +} + +static void __exit asus_fw_exit(void) +{ + int i; + + for (i = ARRAY_SIZE(armoury_attr_groups) - 1; i >= 0; i--) { + if (armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + } + + if (asus_armoury.gpu_mux_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group); + + if (asus_armoury.mini_led_dev_id) + sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group); + + sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); + kset_unregister(asus_armoury.fw_attr_kset); + device_destroy(&firmware_attributes_class, MKDEV(0, 0)); +} + +module_init(asus_fw_init); +module_exit(asus_fw_exit); + +MODULE_IMPORT_NS("ASUS_WMI"); +MODULE_AUTHOR("Luke Jones "); +MODULE_DESCRIPTION("ASUS BIOS Configuration Driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("wmi:" ASUS_NB_WMI_EVENT_GUID); diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h new file mode 100644 index 000000000000..3a2a674a1b55 --- /dev/null +++ b/drivers/platform/x86/asus-armoury.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Definitions for kernel modules using asus-armoury driver + * + * Copyright (c) 2024 Luke Jones + */ + +#ifndef _ASUS_ARMOURY_H_ +#define _ASUS_ARMOURY_H_ + +#include +#include +#include + +#define DRIVER_NAME "asus-armoury" + +/** + * armoury_attr_uint_store() - Send an uint to WMI method if within min/max. + * @kobj: Pointer to the driver object. + * @attr: Pointer to the attribute calling this function. + * @buf: The buffer to read from, this is parsed to `uint` type. + * @count: Required by sysfs attribute macros, pass in from the callee attr. + * @min: Minimum accepted value. Below this returns -EINVAL. + * @max: Maximum accepted value. Above this returns -EINVAL. + * @store_value: Pointer to where the parsed value should be stored. + * @wmi_dev: The WMI function ID to use. + * + * This function is intended to be generic so it can be called from any "_store" + * attribute which works only with integers. + * + * Integers to be sent to the WMI method is inclusive range checked and + * an error returned if out of range. + * + * If the value is valid and WMI is success then the sysfs attribute is notified + * and if asus_bios_requires_reboot() is true then reboot attribute + * is also notified. + * + * Returns: Either count, or an error. + */ +ssize_t armoury_attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count, u32 min, u32 max, + u32 *store_value, u32 wmi_dev); + +/** + * armoury_attr_uint_show() - Receive an uint from a WMI method. + * @kobj: Pointer to the driver object. + * @attr: Pointer to the attribute calling this function. + * @buf: The buffer to write to, as an `uint` type. + * @wmi_dev: The WMI function ID to use. + * + * This function is intended to be generic so it can be called from any "_show" + * attribute which works only with integers. + * + * Returns: Either count, or an error. + */ +ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf, u32 wmi_dev); + +#define __ASUS_ATTR_RO(_func, _name) \ + { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _func##_##_name##_show, \ + } + +#define __ASUS_ATTR_RO_AS(_name, _show) \ + { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _show, \ + } + +#define __ASUS_ATTR_RW(_func, _name) \ + __ATTR(_name, 0644, _func##_##_name##_show, _func##_##_name##_store) + +#define __WMI_STORE_INT(_attr, _min, _max, _wmi) \ + static ssize_t _attr##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ + { \ + return armoury_attr_uint_store(kobj, attr, buf, count, _min, \ + _max, NULL, _wmi); \ + } + +#define ASUS_WMI_SHOW_INT(_attr, _wmi) \ + static ssize_t _attr##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return armoury_attr_uint_show(kobj, attr, buf, _wmi); \ + } + +/* Create functions and attributes for use in other macros or on their own */ + +/* Shows a formatted static variable */ +#define __ATTR_SHOW_FMT(_prop, _attrname, _fmt, _val) \ + static ssize_t _attrname##_##_prop##_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + return sysfs_emit(buf, _fmt, _val); \ + } \ + static struct kobj_attribute attr_##_attrname##_##_prop = \ + __ASUS_ATTR_RO(_attrname, _prop) + +#define __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname)\ + ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RO(_attrname, current_value); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +#define __ATTR_RW_INT_GROUP_ENUM(_attrname, _minv, _maxv, _wmi, _fsname,\ + _possible, _dispname) \ + __WMI_STORE_INT(_attrname##_current_value, _minv, _maxv, _wmi); \ + ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RW(_attrname, current_value); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +/* Boolean style enumeration, base macro. Requires adding show/store */ +#define __ATTR_GROUP_ENUM(_attrname, _fsname, _possible, _dispname) \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +#define ASUS_ATTR_GROUP_BOOL_RO(_attrname, _fsname, _wmi, _dispname) \ + __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, "0;1", _dispname) + + +#define ASUS_ATTR_GROUP_BOOL_RW(_attrname, _fsname, _wmi, _dispname) \ + __ATTR_RW_INT_GROUP_ENUM(_attrname, 0, 1, _wmi, _fsname, "0;1", _dispname) + +#define ASUS_ATTR_GROUP_ENUM_INT_RO(_attrname, _fsname, _wmi, _possible, _dispname) \ + __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname) + +/* + * Requires _current_value_show(), _current_value_show() + */ +#define ASUS_ATTR_GROUP_BOOL(_attrname, _fsname, _dispname) \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RW(_attrname, current_value); \ + __ATTR_GROUP_ENUM(_attrname, _fsname, "0;1", _dispname) + +/* + * Requires _current_value_show(), _current_value_show() + * and _possible_values_show() + */ +#define ASUS_ATTR_GROUP_ENUM(_attrname, _fsname, _dispname) \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RW(_attrname, current_value); \ + static struct kobj_attribute attr_##_attrname##_possible_values = \ + __ASUS_ATTR_RO(_attrname, possible_values); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, enum_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_possible_values.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +#endif /* _ASUS_ARMOURY_H_ */ diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index c3e90517ce0f..ff98267e5981 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -55,8 +57,6 @@ module_param(fnlock_default, bool, 0444); #define to_asus_wmi_driver(pdrv) \ (container_of((pdrv), struct asus_wmi_driver, platform_driver)) -#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" - #define NOTIFY_BRNUP_MIN 0x11 #define NOTIFY_BRNUP_MAX 0x1f #define NOTIFY_BRNDOWN_MIN 0x20 @@ -105,8 +105,6 @@ module_param(fnlock_default, bool, 0444); #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 -#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" - #define WMI_EVENT_MASK 0xFFFF #define FAN_CURVE_POINTS 8 @@ -561,8 +559,8 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval) * * Returns: * * %-ENODEV - method ID is unsupported. - * * %0 - successful and retval is filled. - * * %other - error from WMI call. + * * %0 - successful and retval is filled. + * * %other - error from WMI call. */ int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) { diff --git a/include/linux/platform_data/x86/asus-wmi-leds-ids.h b/include/linux/platform_data/x86/asus-wmi-leds-ids.h new file mode 100644 index 000000000000..034a039c4e37 --- /dev/null +++ b/include/linux/platform_data/x86/asus-wmi-leds-ids.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H +#define __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H + +#include +#include + +/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ +#if IS_REACHABLE(CONFIG_ASUS_WMI) || IS_REACHABLE(CONFIG_HID_ASUS) +static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA403U"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605M"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71L"), + }, + }, + { }, +}; +#endif + +#endif /* __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index dbd44d9fbb6f..8ea8925a0fc5 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -4,7 +4,9 @@ #include #include -#include + +#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" +#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" /* WMI Methods */ #define ASUS_WMI_METHODID_SPEC 0x43455053 /* BIOS SPECification */ @@ -191,44 +193,4 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, } #endif -/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ -static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "GA403U"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "GU605M"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "RC71L"), - }, - }, - { }, -}; - #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ -- cgit v1.2.3 From 628cb03b15f2a0f10534979b3ea9c8befe87c381 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:13 +0100 Subject: platform/x86: asus-armoury: add panel_hd_mode attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add panel_hd_mode to toggle the panel mode between single and high definition modes. Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251102215319.3126879-4-denis.benato@linux.dev Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 6 +++++- include/linux/platform_data/x86/asus-wmi.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index 81b4972df818..f0cb973a487e 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -96,7 +96,8 @@ static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot); static bool asus_bios_requires_reboot(struct kobj_attribute *attr) { - return !strcmp(attr->attr.name, "gpu_mux_mode"); + return !strcmp(attr->attr.name, "gpu_mux_mode") || + !strcmp(attr->attr.name, "panel_hd_mode"); } /** @@ -607,6 +608,8 @@ ASUS_ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWER "Set MCU powersaving mode"); ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, "Set the panel refresh overdrive"); +ASUS_ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD, + "Set the panel HD mode to UHD<0> or FHD<1>"); ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, "Show the eGPU connection status"); @@ -620,6 +623,7 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, + { &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD }, }; static int asus_fw_attr_add(void) diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 8ea8925a0fc5..3cc235b20be4 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -75,6 +75,7 @@ #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019 /* Misc */ +#define ASUS_WMI_DEVID_PANEL_HD 0x0005001C #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 -- cgit v1.2.3 From 05d6f1cc2dc214c1491181be13f37d2a3a26f694 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 13 Oct 2025 11:12:02 +0200 Subject: compiler.h: remove ARCH_SEL() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its last user was removed in commit 8ea815399c3f ("compiler: remove __ADDRESSABLE_ASM{_STR,}() again"). Link: https://lkml.kernel.org/r/20251013-arch-sel-v1-1-7eef9b22ceb0@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton --- include/linux/compiler.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..a9a2f8aae821 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -269,12 +269,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_64BIT -#define ARCH_SEL(a,b) a -#else -#define ARCH_SEL(a,b) b -#endif - /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined -- cgit v1.2.3 From adc15829fb73e402903b7030729263b6ee4a7232 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 16 Oct 2025 19:58:31 +0530 Subject: crash: let architecture decide crash memory export to iomem_resource With the generic crashkernel reservation, the kernel emits the following warning on powerpc: WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries NIP: c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000 REGS: c000000127cef8a0 TRAP: 0700 Not tainted (6.17.0-auto-12607-g5472d60c129f) MSR: 8000000002029033 CR: 84000840 XER: 20040010 CFAR: c00000000017eed0 IRQMASK: 0 GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001 GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000 GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808 GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950 GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710 NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180 LR [c00000000201de34] add_system_ram_resources+0xf4/0x180 Call Trace: add_system_ram_resources+0xf4/0x180 (unreliable) do_one_initcall+0x60/0x36c do_initcalls+0x120/0x220 kernel_init_freeable+0x23c/0x390 kernel_init+0x34/0x26c ret_from_kernel_user_thread+0x14/0x1c This warning occurs due to a conflict between crashkernel and System RAM iomem resources. The generic crashkernel reservation adds the crashkernel memory range to /proc/iomem during early initialization. Later, all memblock ranges are added to /proc/iomem as System RAM. If the crashkernel region overlaps with any memblock range, it causes a conflict while adding those memblock regions as iomem resources, triggering the above warning. The conflicting memblock regions are then omitted from /proc/iomem. For example, if the following crashkernel region is added to /proc/iomem: 20000000-11fffffff : Crash kernel then the following memblock regions System RAM regions fail to be inserted: 00000000-7fffffff : System RAM 80000000-257fffffff : System RAM Fix this by not adding the crashkernel memory to /proc/iomem on powerpc. Introduce an architecture hook to let each architecture decide whether to export the crashkernel region to /proc/iomem. For more info checkout commit c40dd2f766440 ("powerpc: Add System RAM to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM resource to prevent crashkernel conflict") Note: Before switching to the generic crashkernel reservation, powerpc never exported the crashkernel region to /proc/iomem. Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation"). Signed-off-by: Sourabh Jain Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.com/ Tested-by: Venkat Rao Bagalkote Cc: Baoquan he Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Cc: Ritesh Harjani (IBM) Cc: Vivek Goyal Cc: Dave Young Cc: Mike Rapoport Cc: Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/crash_reserve.h | 8 ++++++++ include/linux/crash_reserve.h | 6 ++++++ kernel/crash_reserve.c | 3 +++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/crash_reserve.h b/arch/powerpc/include/asm/crash_reserve.h index 6467ce29b1fa..d1b570ddbf98 100644 --- a/arch/powerpc/include/asm/crash_reserve.h +++ b/arch/powerpc/include/asm/crash_reserve.h @@ -5,4 +5,12 @@ /* crash kernel regions are Page size agliged */ #define CRASH_ALIGN PAGE_SIZE +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static inline bool arch_add_crash_res_to_iomem(void) +{ + return false; +} +#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem +#endif + #endif /* _ASM_POWERPC_CRASH_RESERVE_H */ diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 7b44b41d0a20..f0dc03d94ca2 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, void __init reserve_crashkernel_cma(unsigned long long cma_size); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef arch_add_crash_res_to_iomem +static inline bool arch_add_crash_res_to_iomem(void) +{ + return true; +} +#endif #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) #endif diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 87bf4d41eabb..62e60e0223cf 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size) #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { + if (!arch_add_crash_res_to_iomem()) + return 0; + if (crashk_res.start < crashk_res.end) insert_resource(&iomem_resource, &crashk_res); -- cgit v1.2.3 From 37ade54f386c829597f74b54bad335c12bd2a698 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 22 Oct 2025 10:28:04 +0200 Subject: taint/module: remove unnecessary taint_flag.module field The TAINT_RANDSTRUCT and TAINT_FWCTL flags are mistakenly set in the taint_flags table as per-module flags. While this can be trivially corrected, the issue can be avoided altogether by removing the taint_flag.module field. This is possible because, since commit 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") in 2016, the handling of module taint flags has been fully generic. Specifically, module_flags_taint() can print all flags, and the required output buffer size is properly defined in terms of TAINT_FLAGS_COUNT. The actual per-module flags are always those added to module.taints by calls to add_taint_module(). Link: https://lkml.kernel.org/r/20251022082938.26670-1-petr.pavlu@suse.com Signed-off-by: Petr Pavlu Acked-by: Petr Mladek Reviewed-by: Randy Dunlap Cc: Aaron Tomlin Cc: Luis Chamberalin Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- include/linux/panic.h | 1 - kernel/module/main.c | 2 +- kernel/panic.c | 46 +++++++++++++++++++++------------------------- 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index 6f972a66c13e..a00bc0937698 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -86,7 +86,6 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) struct taint_flag { char c_true; /* character printed when tainted */ char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ const char *desc; /* verbose description of the set taint flag */ }; diff --git a/kernel/module/main.c b/kernel/module/main.c index c66b26184936..6f219751df7e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -954,7 +954,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &taints)) + if (test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } diff --git a/kernel/panic.c b/kernel/panic.c index ec59cade1f83..ffceb6f13935 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -628,17 +628,13 @@ void panic(const char *fmt, ...) } EXPORT_SYMBOL(panic); -#define TAINT_FLAG(taint, _c_true, _c_false, _module) \ +#define TAINT_FLAG(taint, _c_true, _c_false) \ [ TAINT_##taint ] = { \ .c_true = _c_true, .c_false = _c_false, \ - .module = _module, \ .desc = #taint, \ } /* - * TAINT_FORCED_RMMOD could be a per-module flag but the module - * is being removed anyway. - * * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT, * please also modify tools/debugging/kernel-chktaint and * Documentation/admin-guide/tainted-kernels.rst, including its @@ -646,26 +642,26 @@ EXPORT_SYMBOL(panic); * /proc/sys/kernel/tainted. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { - TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G', true), - TAINT_FLAG(FORCED_MODULE, 'F', ' ', true), - TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' ', false), - TAINT_FLAG(FORCED_RMMOD, 'R', ' ', false), - TAINT_FLAG(MACHINE_CHECK, 'M', ' ', false), - TAINT_FLAG(BAD_PAGE, 'B', ' ', false), - TAINT_FLAG(USER, 'U', ' ', false), - TAINT_FLAG(DIE, 'D', ' ', false), - TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' ', false), - TAINT_FLAG(WARN, 'W', ' ', false), - TAINT_FLAG(CRAP, 'C', ' ', true), - TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' ', false), - TAINT_FLAG(OOT_MODULE, 'O', ' ', true), - TAINT_FLAG(UNSIGNED_MODULE, 'E', ' ', true), - TAINT_FLAG(SOFTLOCKUP, 'L', ' ', false), - TAINT_FLAG(LIVEPATCH, 'K', ' ', true), - TAINT_FLAG(AUX, 'X', ' ', true), - TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), - TAINT_FLAG(TEST, 'N', ' ', true), - TAINT_FLAG(FWCTL, 'J', ' ', true), + TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), + TAINT_FLAG(FORCED_MODULE, 'F', ' '), + TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '), + TAINT_FLAG(FORCED_RMMOD, 'R', ' '), + TAINT_FLAG(MACHINE_CHECK, 'M', ' '), + TAINT_FLAG(BAD_PAGE, 'B', ' '), + TAINT_FLAG(USER, 'U', ' '), + TAINT_FLAG(DIE, 'D', ' '), + TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '), + TAINT_FLAG(WARN, 'W', ' '), + TAINT_FLAG(CRAP, 'C', ' '), + TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '), + TAINT_FLAG(OOT_MODULE, 'O', ' '), + TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '), + TAINT_FLAG(SOFTLOCKUP, 'L', ' '), + TAINT_FLAG(LIVEPATCH, 'K', ' '), + TAINT_FLAG(AUX, 'X', ' '), + TAINT_FLAG(RANDSTRUCT, 'T', ' '), + TAINT_FLAG(TEST, 'N', ' '), + TAINT_FLAG(FWCTL, 'J', ' '), }; #undef TAINT_FLAG -- cgit v1.2.3 From d99dc586ca7c7729450af2ed39ca1483c0eb7b5c Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Thu, 23 Oct 2025 13:16:06 -0400 Subject: uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1f9a8286bc0c ("uaccess: always export _copy_[from|to]_user with CONFIG_RUST") exports _copy_{from,to}_user() unconditionally, if RUST is enabled. This pollutes exported symbols namespace, and spreads RUST ifdefery in core files. It's better to declare a corresponding helper under the rust/helpers, similarly to how non-underscored copy_{from,to}_user() is handled. [yury.norov@gmail.com: drop rust part of comment for _copy_from_user(), per Alice] Link: https://lkml.kernel.org/r/20251024154754.99768-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20251023171607.1171534-1-yury.norov@gmail.com Signed-off-by: Yury Norov (NVIDIA) Acked-by: Arnd Bergmann Acked-by: Miguel Ojeda Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: John Hubbard Cc: Trevor Gross Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 2 -- lib/usercopy.c | 4 ++-- rust/helpers/uaccess.c | 12 ++++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..01cbd7dd0ba3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -152,8 +152,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. - * - * Rust code always uses the extern definition. */ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) diff --git a/lib/usercopy.c b/lib/usercopy.c index 7b17b83c8042..b00a3a957de6 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,7 +12,7 @@ /* out-of-line parts */ -#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) +#if !defined(INLINE_COPY_FROM_USER) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { return _inline_copy_from_user(to, from, n); @@ -20,7 +20,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n EXPORT_SYMBOL(_copy_from_user); #endif -#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) +#if !defined(INLINE_COPY_TO_USER) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { return _inline_copy_to_user(to, from, n); diff --git a/rust/helpers/uaccess.c b/rust/helpers/uaccess.c index f49076f813cd..4629b2d15529 100644 --- a/rust/helpers/uaccess.c +++ b/rust/helpers/uaccess.c @@ -13,3 +13,15 @@ unsigned long rust_helper_copy_to_user(void __user *to, const void *from, { return copy_to_user(to, from, n); } + +#ifdef INLINE_COPY_FROM_USER +unsigned long rust_helper__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return _inline_copy_from_user(to, from, n); +} + +unsigned long rust_helper__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return _inline_copy_to_user(to, from, n); +} +#endif -- cgit v1.2.3 From 6c2e6e2c1af1809d1d9cdbd50ac80f54f5995bdb Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 25 Oct 2025 16:00:03 +0800 Subject: dynamic_debug: add support for print stack In practical problem diagnosis, especially during the boot phase, it is often desirable to know the call sequence. However, currently, apart from adding print statements and recompiling the kernel, there seems to be no good alternative. If dynamic_debug supported printing the call stack, it would be very helpful for diagnosing issues. This patch add support '+d' for dump stack. Link: https://lkml.kernel.org/r/20251025080003.312536-1-yebin@huaweicloud.com Signed-off-by: Ye Bin Cc: Jason Baron Cc: Jim Cromie Signed-off-by: Andrew Morton --- Documentation/admin-guide/dynamic-debug-howto.rst | 5 +++-- include/linux/dynamic_debug.h | 17 ++++++++++++++--- lib/dynamic_debug.c | 1 + 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 7c036590cd07..095a63892257 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -223,12 +223,13 @@ The flags are:: f Include the function name s Include the source file name l Include line number + d Include call trace For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only the ``p`` flag has meaning, other flags are ignored. -Note the regexp ``^[-+=][fslmpt_]+$`` matches a flags specification. -To clear all flags at once, use ``=_`` or ``-fslmpt``. +Note the regexp ``^[-+=][fslmptd_]+$`` matches a flags specification. +To clear all flags at once, use ``=_`` or ``-fslmptd``. Debug messages during Boot Process diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index ff44ec346162..05743900a116 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -38,11 +38,12 @@ struct _ddebug { #define _DPRINTK_FLAGS_INCL_LINENO (1<<3) #define _DPRINTK_FLAGS_INCL_TID (1<<4) #define _DPRINTK_FLAGS_INCL_SOURCENAME (1<<5) +#define _DPRINTK_FLAGS_INCL_STACK (1<<6) #define _DPRINTK_FLAGS_INCL_ANY \ (_DPRINTK_FLAGS_INCL_MODNAME | _DPRINTK_FLAGS_INCL_FUNCNAME |\ _DPRINTK_FLAGS_INCL_LINENO | _DPRINTK_FLAGS_INCL_TID |\ - _DPRINTK_FLAGS_INCL_SOURCENAME) + _DPRINTK_FLAGS_INCL_SOURCENAME | _DPRINTK_FLAGS_INCL_STACK) #if defined DEBUG #define _DPRINTK_FLAGS_DEFAULT _DPRINTK_FLAGS_PRINT @@ -160,6 +161,12 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); +#define __dynamic_dump_stack(desc) \ +{ \ + if (desc.flags & _DPRINTK_FLAGS_INCL_STACK) \ + dump_stack(); \ +} + #define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt) \ static struct _ddebug __aligned(8) \ __section("__dyndbg") name = { \ @@ -220,8 +227,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, */ #define __dynamic_func_call_cls(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(&id, ##__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call(id, fmt, func, ...) \ __dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt, \ @@ -229,8 +238,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call_no_desc(id, fmt, func, ...) \ __dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT, \ diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5a007952f7f2..7d7892e57a01 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -95,6 +95,7 @@ static const struct { unsigned flag:8; char opt_char; } opt_array[] = { { _DPRINTK_FLAGS_INCL_SOURCENAME, 's' }, { _DPRINTK_FLAGS_INCL_LINENO, 'l' }, { _DPRINTK_FLAGS_INCL_TID, 't' }, + { _DPRINTK_FLAGS_INCL_STACK, 'd' }, { _DPRINTK_FLAGS_NONE, '_' }, }; -- cgit v1.2.3 From a0b8c6af29a4be3ca2ff9a95cf71e54db5d73e65 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 24 Oct 2025 21:51:20 +0100 Subject: lib/xxhash: remove more unused xxh functions xxh32_reset() and xxh32_copy_state() are unused, and with those gone, the xxh32_state struct is also unused. xxh64_copy_state() is also unused. Remove them all. (Also fixes a comment above the xxh64_state that referred to it as xxh32_state). Link: https://lkml.kernel.org/r/20251024205120.454508-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Suggested-by: Christoph Hellwig Reviewed-by: Kuan-Wei Chiu Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/xxhash.h | 46 +--------------------------------------------- lib/xxhash.c | 29 ----------------------------- 2 files changed, 1 insertion(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 27f57eca8cb1..587122e2c29c 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -141,21 +141,7 @@ static inline unsigned long xxhash(const void *input, size_t length, */ /** - * struct xxh32_state - private xxh32 state, do not use members directly - */ -struct xxh32_state { - uint32_t total_len_32; - uint32_t large_len; - uint32_t v1; - uint32_t v2; - uint32_t v3; - uint32_t v4; - uint32_t mem32[4]; - uint32_t memsize; -}; - -/** - * struct xxh32_state - private xxh64 state, do not use members directly + * struct xxh64_state - private xxh64 state, do not use members directly */ struct xxh64_state { uint64_t total_len; @@ -167,16 +153,6 @@ struct xxh64_state { uint32_t memsize; }; -/** - * xxh32_reset() - reset the xxh32 state to start a new hashing operation - * - * @state: The xxh32 state to reset. - * @seed: Initialize the hash state with this seed. - * - * Call this function on any xxh32_state to prepare for a new hashing operation. - */ -void xxh32_reset(struct xxh32_state *state, uint32_t seed); - /** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * @@ -210,24 +186,4 @@ int xxh64_update(struct xxh64_state *state, const void *input, size_t length); */ uint64_t xxh64_digest(const struct xxh64_state *state); -/*-************************** - * Utils - ***************************/ - -/** - * xxh32_copy_state() - copy the source state into the destination state - * - * @src: The source xxh32 state. - * @dst: The destination xxh32 state. - */ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src); - -/** - * xxh64_copy_state() - copy the source state into the destination state - * - * @src: The source xxh64 state. - * @dst: The destination xxh64 state. - */ -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src); - #endif /* XXHASH_H */ diff --git a/lib/xxhash.c b/lib/xxhash.c index cf629766f376..4125b3e3cf7f 100644 --- a/lib/xxhash.c +++ b/lib/xxhash.c @@ -73,21 +73,6 @@ static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; -/*-************************** - * Utils - ***************************/ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) -{ - memcpy(dst, src, sizeof(*dst)); -} -EXPORT_SYMBOL(xxh32_copy_state); - -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) -{ - memcpy(dst, src, sizeof(*dst)); -} -EXPORT_SYMBOL(xxh64_copy_state); - /*-*************************** * Simple Hash Functions ****************************/ @@ -239,20 +224,6 @@ EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions ***************************************************/ -void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) -{ - /* use a local state for memcpy() to avoid strict-aliasing warnings */ - struct xxh32_state state; - - memset(&state, 0, sizeof(state)); - state.v1 = seed + PRIME32_1 + PRIME32_2; - state.v2 = seed + PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - PRIME32_1; - memcpy(statePtr, &state, sizeof(state)); -} -EXPORT_SYMBOL(xxh32_reset); - void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ -- cgit v1.2.3 From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO int oa function. Later patches will improve the function signature to consolidate more code. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 9 ++++++--- drivers/vfio/vfio_main.c | 7 +++++++ include/linux/vfio.h | 2 ++ include/linux/vfio_pci_core.h | 2 ++ 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..1dc350003f07 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -996,9 +996,11 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg) { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; @@ -1132,6 +1134,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1458,7 +1461,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); + return vfio_pci_ioctl_get_region_info(core_vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..a390163ce706 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1296,7 +1296,14 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + if (!device->ops->get_region_info) + goto ioctl_fallback; + ret = device->ops->get_region_info(device, uptr); + break; + default: +ioctl_fallback: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; else diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..be5fcf8432e8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -132,6 +132,8 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info)(struct vfio_device *vdev, + struct vfio_region_info __user *arg); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..160bc2e31ece 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -115,6 +115,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 56 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/vfio.h | 4 ++++ 2 files changed, 56 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f056e82ba350..48d034aede46 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1259,6 +1259,57 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + if (device->ops->get_region_info_caps) { + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, + caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)) { + ret = -EFAULT; + goto out_free; + } + } else if (device->ops->get_region_info) { + ret = device->ops->get_region_info(device, arg); + if (ret) + return ret; + } else { + return -EINVAL; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1297,10 +1348,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, break; case VFIO_DEVICE_GET_REGION_INFO: - if (unlikely(!device->ops->get_region_info)) - ret = -EINVAL; - else - ret = device->ops->get_region_info(device, uptr); + ret = vfio_get_region_info(device, uptr); break; default: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be5fcf8432e8..6311ddc83770 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -134,6 +135,9 @@ struct vfio_device_ops { unsigned long arg); int (*get_region_info)(struct vfio_device *vdev, struct vfio_region_info __user *arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); -- cgit v1.2.3 From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 30 +++---- drivers/vfio/pci/mlx5/main.c | 2 +- drivers/vfio/pci/nvgrace-gpu/main.c | 51 +++--------- drivers/vfio/pci/pds/vfio_dev.c | 2 +- drivers/vfio/pci/qat/main.c | 2 +- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 103 ++++++++++--------------- drivers/vfio/pci/virtio/common.h | 3 +- drivers/vfio/pci/virtio/legacy_io.c | 26 ++----- drivers/vfio/pci/virtio/main.c | 6 +- include/linux/vfio_pci_core.h | 3 +- 11 files changed, 80 insertions(+), 150 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 1fd3a2075ee4..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1386,32 +1386,22 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, } static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - if (info.index != VFIO_PCI_BAR2_REGION_INDEX) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + info->size = hisi_acc_get_resource_len(vdev, info->index); - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - - info.size = hisi_acc_get_resource_len(vdev, info.index); - - info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1610,7 +1600,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = hisi_acc_vfio_ioctl_get_region, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, @@ -1631,7 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b7f941f8047e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,7 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index cab743a30dc3..5a6f77d5c81e 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -205,34 +205,25 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, return 0; } -static int -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_ioctl_get_region_info(core_vdev, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -251,40 +242,22 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, @@ -686,7 +659,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, - .get_region_info = nvgrace_gpu_ioctl_get_region_info, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -707,7 +680,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 1946bc75d99b..be103c74e969 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,7 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 8452d9c1d11d..8fbdf7c6d666 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,7 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2d9122efc10b..a3e49d42c771 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,7 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f21d9026068c..57c0766fb9f8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -997,43 +997,35 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, } int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1045,9 +1037,9 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1057,16 +1049,17 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. */ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1075,10 +1068,10 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1087,52 +1080,34 @@ int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index a10f2d92cb62..cb3d5e57d3a3 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -110,7 +110,8 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index d735d5c4bd77..1ed349a55629 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,29 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_region_info info = {}; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_ioctl_get_region_info(core_vdev, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d68096bc5252..d2e5cbca13c8 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,7 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -110,7 +110,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = virtiovf_pci_ioctl_get_region_info, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -132,7 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, - .get_region_info = vfio_pci_ioctl_get_region_info, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 160bc2e31ece..e74f94c17fbe 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -116,7 +116,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 50 +++++++++++++++++++++--------------------------- include/linux/vfio.h | 2 -- 2 files changed, 22 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 48d034aede46..b8fe1a75e48a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1267,42 +1267,36 @@ static long vfio_get_region_info(struct vfio_device *device, struct vfio_info_cap caps = {}; int ret; + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (device->ops->get_region_info_caps) { - ret = device->ops->get_region_info_caps(device, &info, &caps); - if (ret) - goto out_free; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, - caps.size)) { - ret = -EFAULT; - goto out_free; - } - info.cap_offset = sizeof(info); + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; } + info.cap_offset = sizeof(info); } + } - if (copy_to_user(arg, &info, minsz)) { - ret = -EFAULT; - goto out_free; - } - } else if (device->ops->get_region_info) { - ret = device->ops->get_region_info(device, arg); - if (ret) - return ret; - } else { - return -EINVAL; + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; } out_free: diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6311ddc83770..8e1ddb48b9b5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -133,8 +133,6 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*get_region_info)(struct vfio_device *vdev, - struct vfio_region_info __user *arg); int (*get_region_info_caps)(struct vfio_device *vdev, struct vfio_region_info *info, struct vfio_info_cap *caps); -- cgit v1.2.3 From 044b9f1a7f4f3d41563007d0762c83a7d7505ac0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 12 Nov 2025 08:46:14 +0100 Subject: PCI/PTM: Enable only if device advertises relevant role We have a Switch Upstream Port (2b:00.0) that has a PTM Capability, but doesn't advertise support for any PTM roles: Capabilities: [220 v1] Precision Time Measurement PTMCap: Requester- Responder- Root- Linux enables PTM without looking into what roles it actually supports, and apparently the Port immediately sends PTM Requests even though it doesn't support the PTM Requester role. The messages include an invalid bus number, so the Root Port detects an ACS Violation (see the PCIe r7.0, sec 6.12.1.1, implementation note): pci 0000:2b:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port pci 0000:2b:00.0: PTM enabled, 4ns granularity pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID) pcieport 0000:00:07.1: device [8086:e44f] error status/mask=00200000/00000000 pcieport 0000:00:07.1: [21] ACSViol (First) pcieport 0000:00:07.1: AER: TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000 The TLP Header shows a 4 DW header, no data (001b) Msg with Local routing (1 0100b) with Requester ID 0x0000 and PTM Request code (0x52). Fix this by enabling PTM only if the following conditions are true (see sec 6.21.1 figure 6-21): - Endpoint must advertise PTM Requester Capable - Switch Upstream Port must advertise PTM Responder Capable - Root Port must advertise PTM Root Capable Signed-off-by: Mika Westerberg [bhelgaas: commit log, comments] Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251112074614.1440266-1-mika.westerberg@linux.intel.com --- drivers/pci/pcie/ptm.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 2 ++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 65e4b008be00..ed0f9691e7d1 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -81,6 +81,11 @@ void pci_ptm_init(struct pci_dev *dev) dev->ptm_granularity = 0; } + if (cap & PCI_PTM_CAP_RES) + dev->ptm_responder = 1; + if (cap & PCI_PTM_CAP_REQ) + dev->ptm_requester = 1; + if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) pci_enable_ptm(dev, NULL); @@ -144,6 +149,24 @@ static int __pci_enable_ptm(struct pci_dev *dev) return -EINVAL; } + switch (pci_pcie_type(dev)) { + case PCI_EXP_TYPE_ROOT_PORT: + if (!dev->ptm_root) + return -EINVAL; + break; + case PCI_EXP_TYPE_UPSTREAM: + if (!dev->ptm_responder) + return -EINVAL; + break; + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_LEG_END: + if (!dev->ptm_requester) + return -EINVAL; + break; + default: + return -EINVAL; + } + pci_read_config_dword(dev, ptm + PCI_PTM_CTRL, &ctrl); ctrl |= PCI_PTM_CTRL_ENABLE; diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..d5018cb5c331 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -500,6 +500,8 @@ struct pci_dev { #ifdef CONFIG_PCIE_PTM u16 ptm_cap; /* PTM Capability */ unsigned int ptm_root:1; + unsigned int ptm_responder:1; + unsigned int ptm_requester:1; unsigned int ptm_enabled:1; u8 ptm_granularity; #endif -- cgit v1.2.3 From 4f49088c162579a4ed049c555fe0cd188fd928c4 Mon Sep 17 00:00:00 2001 From: Khairul Anuar Romli Date: Wed, 8 Oct 2025 17:09:05 +0800 Subject: firmware: stratix10-svc: Add definition for voltage and temperature sensor Add entry in Stratix 10 Service Layer to support temperature and voltage sensor. Signed-off-by: Khairul Anuar Romli Signed-off-by: Dinh Nguyen --- drivers/firmware/stratix10-svc.c | 21 +++++++++++-- include/linux/firmware/intel/stratix10-smc.h | 34 ++++++++++++++++++++++ .../linux/firmware/intel/stratix10-svc-client.h | 8 ++++- 3 files changed, 60 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index e3f990d888d7..5a32c1054bee 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -34,7 +34,7 @@ * timeout is set to 30 seconds (30 * 1000) at Intel Stratix10 SoC. */ #define SVC_NUM_DATA_IN_FIFO 32 -#define SVC_NUM_CHANNEL 3 +#define SVC_NUM_CHANNEL 4 #define FPGA_CONFIG_DATA_CLAIM_TIMEOUT_MS 200 #define FPGA_CONFIG_STATUS_TIMEOUT_SEC 30 #define BYTE_TO_WORD_SIZE 4 @@ -341,6 +341,8 @@ static void svc_thread_recv_status_ok(struct stratix10_svc_data *p_data, case COMMAND_RSU_MAX_RETRY: case COMMAND_RSU_DCMF_STATUS: case COMMAND_FIRMWARE_VERSION: + case COMMAND_HWMON_READTEMP: + case COMMAND_HWMON_READVOLT: cb_data->status = BIT(SVC_STATUS_OK); cb_data->kaddr1 = &res.a1; break; @@ -525,7 +527,17 @@ static int svc_normal_to_secure_thread(void *data) a1 = (unsigned long)pdata->paddr; a2 = 0; break; - + /* for HWMON */ + case COMMAND_HWMON_READTEMP: + a0 = INTEL_SIP_SMC_HWMON_READTEMP; + a1 = pdata->arg[0]; + a2 = 0; + break; + case COMMAND_HWMON_READVOLT: + a0 = INTEL_SIP_SMC_HWMON_READVOLT; + a1 = pdata->arg[0]; + a2 = 0; + break; /* for polling */ case COMMAND_POLL_SERVICE_STATUS: a0 = INTEL_SIP_SMC_SERVICE_COMPLETED; @@ -1197,6 +1209,11 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) chans[2].name = SVC_CLIENT_FCS; spin_lock_init(&chans[2].lock); + chans[3].scl = NULL; + chans[3].ctrl = controller; + chans[3].name = SVC_CLIENT_HWMON; + spin_lock_init(&chans[3].lock); + list_add_tail(&controller->node, &svc_ctrl); platform_set_drvdata(pdev, controller); diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index ee80ca4bb0d0..7306dd243b2a 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -620,4 +620,38 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_FCS_GET_PROVISION_DATA \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_GET_PROVISION_DATA) +/** + * Request INTEL_SIP_SMC_HWMON_READTEMP + * Sync call to request temperature + * + * Call register usage: + * a0 Temperature Channel + * a1-a7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1 Temperature Value + * a2-a3 not used + */ +#define INTEL_SIP_SMC_FUNCID_HWMON_READTEMP 32 +#define INTEL_SIP_SMC_HWMON_READTEMP \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READTEMP) + +/** + * Request INTEL_SIP_SMC_HWMON_READVOLT + * Sync call to request voltage + * + * Call register usage: + * a0 Voltage Channel + * a1-a7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1 Voltage Value + * a2-a3 not used + */ +#define INTEL_SIP_SMC_FUNCID_HWMON_READVOLT 33 +#define INTEL_SIP_SMC_HWMON_READVOLT \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT) + #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 60ed82112680..520004a5f15d 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -11,12 +11,14 @@ * * fpga: for FPGA configuration * rsu: for remote status update + * hwmon: for hardware monitoring (voltage and temperature) */ #define SVC_CLIENT_FPGA "fpga" #define SVC_CLIENT_RSU "rsu" #define SVC_CLIENT_FCS "fcs" +#define SVC_CLIENT_HWMON "hwmon" -/* +/** * Status of the sent command, in bit number * * SVC_STATUS_OK: @@ -70,6 +72,7 @@ #define SVC_RSU_REQUEST_TIMEOUT_MS 300 #define SVC_FCS_REQUEST_TIMEOUT_MS 2000 #define SVC_COMPLETED_TIMEOUT_MS 30000 +#define SVC_HWMON_REQUEST_TIMEOUT_MS 300 struct stratix10_svc_chan; @@ -171,6 +174,9 @@ enum stratix10_svc_command_code { COMMAND_MBOX_SEND_CMD = 100, /* Non-mailbox SMC Call */ COMMAND_SMC_SVC_VERSION = 200, + /* for HWMON */ + COMMAND_HWMON_READTEMP, + COMMAND_HWMON_READVOLT }; /** -- cgit v1.2.3 From bcb9f4f0706147afc62c48533276a18fe7b8f354 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Mon, 27 Oct 2025 22:54:41 +0800 Subject: firmware: stratix10-svc: Add support for async communication Introduce support for asynchronous communication with the Stratix10 service channel. Define new structures to enable asynchronous messaging with the Secure Device Manager (SDM). Add and remove asynchronous support for existing channels. Implement initialization and cleanup routines for the asynchronous framework. Enable sending and polling of messages to the SDM asynchronously. The new public functions added are: - stratix10_svc_add_async_client: Adds a client to the service channel. - stratix10_svc_remove_async_client: Removes an asynchronous client from the service channel. - stratix10_svc_async_send: Sends an asynchronous message to the SDM mailbox in EL3 secure firmware. - stratix10_svc_async_poll: Polls the status of an asynchronous service request in EL3 secure firmware. - stratix10_svc_async_done: Marks an asynchronous transaction as complete and frees up the resources. These changes enhance the functionality of the Stratix10 service channel by allowing for more efficient and flexible communication with the firmware. Signed-off-by: Mahesh Rao Reviewed-by: Matthew Gerlach Signed-off-by: Dinh Nguyen --- drivers/firmware/stratix10-svc.c | 656 ++++++++++++++++++++- include/linux/firmware/intel/stratix10-smc.h | 25 + .../linux/firmware/intel/stratix10-svc-client.h | 88 +++ 3 files changed, 765 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index 9372a17d89b7..14bfa36a58ed 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -4,9 +4,12 @@ * Copyright (C) 2025, Altera Corporation */ +#include #include #include #include +#include +#include #include #include #include @@ -44,6 +47,49 @@ #define STRATIX10_RSU "stratix10-rsu" #define INTEL_FCS "intel-fcs" +/* Maximum number of SDM client IDs. */ +#define MAX_SDM_CLIENT_IDS 16 +/* Client ID for SIP Service Version 1. */ +#define SIP_SVC_V1_CLIENT_ID 0x1 +/* Maximum number of SDM job IDs. */ +#define MAX_SDM_JOB_IDS 16 +/* Number of bits used for asynchronous transaction hashing. */ +#define ASYNC_TRX_HASH_BITS 3 +/** + * Total number of transaction IDs, which is a combination of + * client ID and job ID. + */ +#define TOTAL_TRANSACTION_IDS \ + (MAX_SDM_CLIENT_IDS * MAX_SDM_JOB_IDS) + +/* Minimum major version of the ATF for Asynchronous transactions. */ +#define ASYNC_ATF_MINIMUM_MAJOR_VERSION 0x3 +/* Minimum minor version of the ATF for Asynchronous transactions.*/ +#define ASYNC_ATF_MINIMUM_MINOR_VERSION 0x0 + +/* Job ID field in the transaction ID */ +#define STRATIX10_JOB_FIELD GENMASK(3, 0) +/* Client ID field in the transaction ID */ +#define STRATIX10_CLIENT_FIELD GENMASK(7, 4) +/* Transaction ID mask for Stratix10 service layer */ +#define STRATIX10_TRANS_ID_FIELD GENMASK(7, 0) + +/* Macro to extract the job ID from a transaction ID. */ +#define STRATIX10_GET_JOBID(transaction_id) \ + (FIELD_GET(STRATIX10_JOB_FIELD, transaction_id)) +/* Macro to set the job ID in a transaction ID. */ +#define STRATIX10_SET_JOBID(jobid) \ + (FIELD_PREP(STRATIX10_JOB_FIELD, jobid)) +/* Macro to set the client ID in a transaction ID. */ +#define STRATIX10_SET_CLIENTID(clientid) \ + (FIELD_PREP(STRATIX10_CLIENT_FIELD, clientid)) +/* Macro to set a transaction ID using a client ID and a job ID. */ +#define STRATIX10_SET_TRANSACTIONID(clientid, jobid) \ + (STRATIX10_SET_CLIENTID(clientid) | STRATIX10_SET_JOBID(jobid)) +/* Macro to set a transaction ID for SIP SMC Async transactions */ +#define STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(transaction_id) \ + (FIELD_PREP(STRATIX10_TRANS_ID_FIELD, transaction_id)) + typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, @@ -64,7 +110,7 @@ struct stratix10_svc { * @sync_complete: state for a completion * @addr: physical address of shared memory block * @size: size of shared memory block - * @invoke_fn: function to issue secure monitor or hypervisor call + * @invoke_fn: service clients to handle secure monitor or hypervisor calls * * This struct is used to save physical address and size of shared memory * block. The shared memory blocked is allocated by secure monitor software @@ -122,6 +168,74 @@ struct stratix10_svc_data { u64 arg[3]; }; +/** + * struct stratix10_svc_async_handler - Asynchronous handler for Stratix10 + * service layer + * @transaction_id: Unique identifier for the transaction + * @achan: Pointer to the asynchronous channel structure + * @cb_arg: Argument to be passed to the callback function + * @cb: Callback function to be called upon completion + * @msg: Pointer to the client message structure + * @next: Node in the hash list + * @res: Response structure to store result from the secure firmware + * + * This structure is used to handle asynchronous transactions in the + * Stratix10 service layer. It maintains the necessary information + * for processing and completing asynchronous requests. + */ + +struct stratix10_svc_async_handler { + u8 transaction_id; + struct stratix10_async_chan *achan; + void *cb_arg; + async_callback_t cb; + struct stratix10_svc_client_msg *msg; + struct hlist_node next; + struct arm_smccc_1_2_regs res; +}; + +/** + * struct stratix10_async_chan - Structure representing an asynchronous channel + * @async_client_id: Unique client identifier for the asynchronous operation + * @job_id_pool: Pointer to the job ID pool associated with this channel + */ + +struct stratix10_async_chan { + unsigned long async_client_id; + struct ida job_id_pool; +}; + +/** + * struct stratix10_async_ctrl - Control structure for Stratix10 + * asynchronous operations + * @initialized: Flag indicating whether the control structure has + * been initialized + * @invoke_fn: Function pointer for invoking Stratix10 service calls + * to EL3 secure firmware + * @async_id_pool: Pointer to the ID pool used for asynchronous + * operations + * @common_achan_refcount: Atomic reference count for the common + * asynchronous channel usage + * @common_async_chan: Pointer to the common asynchronous channel + * structure + * @trx_list_lock: Spinlock for protecting the transaction list + * operations + * @trx_list: Hash table for managing asynchronous transactions + */ + +struct stratix10_async_ctrl { + bool initialized; + void (*invoke_fn)(struct stratix10_async_ctrl *actrl, + const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res); + struct ida async_id_pool; + atomic_t common_achan_refcount; + struct stratix10_async_chan *common_async_chan; + /* spinlock to protect trx_list hash table */ + spinlock_t trx_list_lock; + DECLARE_HASHTABLE(trx_list, ASYNC_TRX_HASH_BITS); +}; + /** * struct stratix10_svc_controller - service controller * @dev: device @@ -135,6 +249,7 @@ struct stratix10_svc_data { * @complete_status: state for completion * @svc_fifo_lock: protect access to service message data queue * @invoke_fn: function to issue secure monitor call or hypervisor call + * @actrl: async control structure * * This struct is used to create communication channels for service clients, to * handle secure monitor or hypervisor call. @@ -151,6 +266,7 @@ struct stratix10_svc_controller { struct completion complete_status; spinlock_t svc_fifo_lock; svc_invoke_fn *invoke_fn; + struct stratix10_async_ctrl actrl; }; /** @@ -159,15 +275,17 @@ struct stratix10_svc_controller { * @scl: pointer to service client which owns the channel * @name: service client name associated with the channel * @lock: protect access to the channel + * @async_chan: reference to asynchronous channel object for this channel * - * This struct is used by service client to communicate with service layer, each - * service client has its own channel created by service controller. + * This struct is used by service client to communicate with service layer. + * Each service client has its own channel created by service controller. */ struct stratix10_svc_chan { struct stratix10_svc_controller *ctrl; struct stratix10_svc_client *scl; char *name; spinlock_t lock; + struct stratix10_async_chan *async_chan; }; static LIST_HEAD(svc_ctrl); @@ -942,6 +1060,525 @@ struct stratix10_svc_chan *stratix10_svc_request_channel_byname( } EXPORT_SYMBOL_GPL(stratix10_svc_request_channel_byname); +/** + * stratix10_svc_add_async_client - Add an asynchronous client to the + * Stratix10 service channel. + * @chan: Pointer to the Stratix10 service channel structure. + * @use_unique_clientid: Boolean flag indicating whether to use a + * unique client ID. + * + * This function adds an asynchronous client to the specified + * Stratix10 service channel. If the `use_unique_clientid` flag is + * set to true, a unique client ID is allocated for the asynchronous + * channel. Otherwise, a common asynchronous channel is used. + * + * Return: 0 on success, or a negative error code on failure: + * -EINVAL if the channel is NULL or the async controller is + * not initialized. + * -EALREADY if the async channel is already allocated. + * -ENOMEM if memory allocation fails. + * Other negative values if ID allocation fails. + */ +int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan, + bool use_unique_clientid) +{ + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + int ret = 0; + + if (!chan) + return -EINVAL; + + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + + if (!actrl->initialized) { + dev_err(ctrl->dev, "Async controller not initialized\n"); + return -EINVAL; + } + + if (chan->async_chan) { + dev_err(ctrl->dev, "async channel already allocated\n"); + return -EALREADY; + } + + if (use_unique_clientid && + atomic_read(&actrl->common_achan_refcount) > 0) { + chan->async_chan = actrl->common_async_chan; + atomic_inc(&actrl->common_achan_refcount); + return 0; + } + + achan = kzalloc(sizeof(*achan), GFP_KERNEL); + if (!achan) + return -ENOMEM; + + ida_init(&achan->job_id_pool); + + ret = ida_alloc_max(&actrl->async_id_pool, MAX_SDM_CLIENT_IDS, + GFP_KERNEL); + if (ret < 0) { + dev_err(ctrl->dev, + "Failed to allocate async client id\n"); + ida_destroy(&achan->job_id_pool); + kfree(achan); + return ret; + } + + achan->async_client_id = ret; + chan->async_chan = achan; + + if (use_unique_clientid && + atomic_read(&actrl->common_achan_refcount) == 0) { + actrl->common_async_chan = achan; + atomic_inc(&actrl->common_achan_refcount); + } + + return 0; +} +EXPORT_SYMBOL_GPL(stratix10_svc_add_async_client); + +/** + * stratix10_svc_remove_async_client - Remove an asynchronous client + * from the Stratix10 service + * channel. + * @chan: Pointer to the Stratix10 service channel structure. + * + * This function removes an asynchronous client associated with the + * given service channel. It checks if the channel and the + * asynchronous channel are valid, and then proceeds to decrement + * the reference count for the common asynchronous channel if + * applicable. If the reference count reaches zero, it destroys the + * job ID pool and deallocates the asynchronous client ID. For + * non-common asynchronous channels, it directly destroys the job ID + * pool, deallocates the asynchronous client ID, and frees the + * memory allocated for the asynchronous channel. + * + * Return: 0 on success, -EINVAL if the channel or asynchronous + * channel is invalid. + */ +int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan) +{ + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + + if (!chan) + return -EINVAL; + + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + achan = chan->async_chan; + + if (!achan) { + dev_err(ctrl->dev, "async channel not allocated\n"); + return -EINVAL; + } + + if (achan == actrl->common_async_chan) { + atomic_dec(&actrl->common_achan_refcount); + if (atomic_read(&actrl->common_achan_refcount) == 0) { + ida_destroy(&achan->job_id_pool); + ida_free(&actrl->async_id_pool, + achan->async_client_id); + kfree(achan); + actrl->common_async_chan = NULL; + } + } else { + ida_destroy(&achan->job_id_pool); + ida_free(&actrl->async_id_pool, achan->async_client_id); + kfree(achan); + } + chan->async_chan = NULL; + + return 0; +} +EXPORT_SYMBOL_GPL(stratix10_svc_remove_async_client); + +/** + * stratix10_svc_async_send - Send an asynchronous message to the + * Stratix10 service + * @chan: Pointer to the service channel structure + * @msg: Pointer to the message to be sent + * @handler: Pointer to the handler for the asynchronous message + * used by caller for later reference. + * @cb: Callback function to be called upon completion + * @cb_arg: Argument to be passed to the callback function + * + * This function sends an asynchronous message to the SDM mailbox in + * EL3 secure firmware. It performs various checks and setups, + * including allocating a job ID, setting up the transaction ID and + * packaging it to El3 firmware. The function handles different + * commands by setting up the appropriate arguments for the SMC call. + * If the SMC call is successful, the handler is set up and the + * function returns 0. If the SMC call fails, appropriate error + * handling is performed along with cleanup of resources. + * + * Return: 0 on success, -EINVAL for invalid argument, -ENOMEM if + * memory is not available, -EAGAIN if EL3 firmware is busy, -EBADF + * if the message is rejected by EL3 firmware and -EIO on other + * errors from EL3 firmware. + */ +int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, + void **handler, async_callback_t cb, void *cb_arg) +{ + struct arm_smccc_1_2_regs args = { 0 }, res = { 0 }; + struct stratix10_svc_async_handler *handle = NULL; + struct stratix10_svc_client_msg *p_msg = + (struct stratix10_svc_client_msg *)msg; + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + int ret = 0; + + if (!chan || !msg || !handler) + return -EINVAL; + + achan = chan->async_chan; + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + + if (!actrl->initialized) { + dev_err(ctrl->dev, "Async controller not initialized\n"); + return -EINVAL; + } + + if (!achan) { + dev_err(ctrl->dev, "Async channel not allocated\n"); + return -EINVAL; + } + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + ret = ida_alloc_max(&achan->job_id_pool, MAX_SDM_JOB_IDS, + GFP_KERNEL); + if (ret < 0) { + dev_err(ctrl->dev, "Failed to allocate job id\n"); + kfree(handle); + return -ENOMEM; + } + + handle->transaction_id = + STRATIX10_SET_TRANSACTIONID(achan->async_client_id, ret); + handle->cb = cb; + handle->msg = p_msg; + handle->cb_arg = cb_arg; + handle->achan = achan; + + /*set the transaction jobid in args.a1*/ + args.a1 = + STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id); + + switch (p_msg->command) { + default: + dev_err(ctrl->dev, "Invalid command ,%d\n", p_msg->command); + ret = -EINVAL; + goto deallocate_id; + } + + /** + * There is a chance that during the execution of async_send() + * in one core, an interrupt might be received in another core; + * to mitigate this we are adding the handle to the DB and then + * send the smc call. If the smc call is rejected or busy then + * we will deallocate the handle for the client to retry again. + */ + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + hash_add(actrl->trx_list, &handle->next, + handle->transaction_id); + } + + actrl->invoke_fn(actrl, &args, &res); + + switch (res.a0) { + case INTEL_SIP_SMC_STATUS_OK: + dev_dbg(ctrl->dev, + "Async message sent with transaction_id 0x%02x\n", + handle->transaction_id); + *handler = handle; + return 0; + case INTEL_SIP_SMC_STATUS_BUSY: + dev_warn(ctrl->dev, "Mailbox is busy, try after some time\n"); + ret = -EAGAIN; + break; + case INTEL_SIP_SMC_STATUS_REJECTED: + dev_err(ctrl->dev, "Async message rejected\n"); + ret = -EBADF; + break; + default: + dev_err(ctrl->dev, + "Failed to send async message ,got status as %ld\n", + res.a0); + ret = -EIO; + } + + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + hash_del(&handle->next); + } + +deallocate_id: + ida_free(&achan->job_id_pool, + STRATIX10_GET_JOBID(handle->transaction_id)); + kfree(handle); + return ret; +} +EXPORT_SYMBOL_GPL(stratix10_svc_async_send); +/** + * stratix10_svc_async_poll - Polls the status of an asynchronous + * transaction. + * @chan: Pointer to the service channel structure. + * @tx_handle: Handle to the transaction being polled. + * @data: Pointer to the callback data structure. + * + * This function polls the status of an asynchronous transaction + * identified by the given transaction handle. It ensures that the + * necessary structures are initialized and valid before proceeding + * with the poll operation. The function sets up the necessary + * arguments for the SMC call, invokes the call, and prepares the + * response data if the call is successful. If the call fails, the + * function returns the error mapped to the SVC status error. + * + * Return: 0 on success, -EINVAL if any input parameter is invalid, + * -EAGAIN if the transaction is still in progress, + * -EPERM if the command is invalid, or other negative + * error codes on failure. + */ +int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, + void *tx_handle, + struct stratix10_svc_cb_data *data) +{ + struct stratix10_svc_async_handler *handle; + struct arm_smccc_1_2_regs args = { 0 }; + struct stratix10_svc_controller *ctrl; + struct stratix10_async_ctrl *actrl; + struct stratix10_async_chan *achan; + + if (!chan || !tx_handle || !data) + return -EINVAL; + + ctrl = chan->ctrl; + actrl = &ctrl->actrl; + achan = chan->async_chan; + + if (!achan) { + dev_err(ctrl->dev, "Async channel not allocated\n"); + return -EINVAL; + } + + handle = (struct stratix10_svc_async_handler *)tx_handle; + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + if (!hash_hashed(&handle->next)) { + dev_err(ctrl->dev, "Invalid transaction handler"); + return -EINVAL; + } + } + + args.a0 = INTEL_SIP_SMC_ASYNC_POLL; + args.a1 = + STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id); + + actrl->invoke_fn(actrl, &args, &handle->res); + + /*clear data for response*/ + memset(data, 0, sizeof(*data)); + + if (handle->res.a0 == INTEL_SIP_SMC_STATUS_OK) { + return 0; + } else if (handle->res.a0 == INTEL_SIP_SMC_STATUS_BUSY) { + dev_dbg(ctrl->dev, "async message is still in progress\n"); + return -EAGAIN; + } + + dev_err(ctrl->dev, + "Failed to poll async message ,got status as %ld\n", + handle->res.a0); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(stratix10_svc_async_poll); + +/** + * stratix10_svc_async_done - Completes an asynchronous transaction. + * @chan: Pointer to the service channel structure. + * @tx_handle: Handle to the transaction being completed. + * + * This function completes an asynchronous transaction identified by + * the given transaction handle. It ensures that the necessary + * structures are initialized and valid before proceeding with the + * completion operation. The function deallocates the transaction ID, + * frees the memory allocated for the handler, and removes the handler + * from the transaction list. + * + * Return: 0 on success, -EINVAL if any input parameter is invalid, + * or other negative error codes on failure. + */ +int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle) +{ + struct stratix10_svc_async_handler *handle; + struct stratix10_svc_controller *ctrl; + struct stratix10_async_chan *achan; + struct stratix10_async_ctrl *actrl; + + if (!chan || !tx_handle) + return -EINVAL; + + ctrl = chan->ctrl; + achan = chan->async_chan; + actrl = &ctrl->actrl; + + if (!achan) { + dev_err(ctrl->dev, "async channel not allocated\n"); + return -EINVAL; + } + + handle = (struct stratix10_svc_async_handler *)tx_handle; + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + if (!hash_hashed(&handle->next)) { + dev_err(ctrl->dev, "Invalid transaction handle"); + return -EINVAL; + } + hash_del(&handle->next); + } + ida_free(&achan->job_id_pool, + STRATIX10_GET_JOBID(handle->transaction_id)); + kfree(handle); + return 0; +} +EXPORT_SYMBOL_GPL(stratix10_svc_async_done); + +static inline void stratix10_smc_1_2(struct stratix10_async_ctrl *actrl, + const struct arm_smccc_1_2_regs *args, + struct arm_smccc_1_2_regs *res) +{ + arm_smccc_1_2_smc(args, res); +} + +/** + * stratix10_svc_async_init - Initialize the Stratix10 service + * controller for asynchronous operations. + * @controller: Pointer to the Stratix10 service controller structure. + * + * This function initializes the asynchronous service controller by + * setting up the necessary data structures and initializing the + * transaction list. + * + * Return: 0 on success, -EINVAL if the controller is NULL or already + * initialized, -ENOMEM if memory allocation fails, + * -EADDRINUSE if the client ID is already reserved, or other + * negative error codes on failure. + */ +static int stratix10_svc_async_init(struct stratix10_svc_controller *controller) +{ + struct stratix10_async_ctrl *actrl; + struct arm_smccc_res res; + struct device *dev; + int ret; + + if (!controller) + return -EINVAL; + + actrl = &controller->actrl; + + if (actrl->initialized) + return -EINVAL; + + dev = controller->dev; + + controller->invoke_fn(INTEL_SIP_SMC_SVC_VERSION, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != INTEL_SIP_SMC_STATUS_OK || + !(res.a1 > ASYNC_ATF_MINIMUM_MAJOR_VERSION || + (res.a1 == ASYNC_ATF_MINIMUM_MAJOR_VERSION && + res.a2 >= ASYNC_ATF_MINIMUM_MINOR_VERSION))) { + dev_err(dev, + "Intel Service Layer Driver: ATF version is not compatible for async operation\n"); + return -EINVAL; + } + + actrl->invoke_fn = stratix10_smc_1_2; + + ida_init(&actrl->async_id_pool); + + /** + * SIP_SVC_V1_CLIENT_ID is used by V1/stratix10_svc_send() clients + * for communicating with SDM synchronously. We need to restrict + * this in V3/stratix10_svc_async_send() usage to distinguish + * between V1 and V3 messages in El3 firmware. + */ + ret = ida_alloc_range(&actrl->async_id_pool, SIP_SVC_V1_CLIENT_ID, + SIP_SVC_V1_CLIENT_ID, GFP_KERNEL); + if (ret < 0) { + dev_err(dev, + "Intel Service Layer Driver: Error on reserving SIP_SVC_V1_CLIENT_ID\n"); + ida_destroy(&actrl->async_id_pool); + actrl->invoke_fn = NULL; + return -EADDRINUSE; + } + + spin_lock_init(&actrl->trx_list_lock); + hash_init(actrl->trx_list); + atomic_set(&actrl->common_achan_refcount, 0); + + actrl->initialized = true; + return 0; +} + +/** + * stratix10_svc_async_exit - Clean up and exit the asynchronous + * service controller + * @ctrl: Pointer to the stratix10_svc_controller structure + * + * This function performs the necessary cleanup for the asynchronous + * service controller. It checks if the controller is valid and if it + * has been initialized. It then locks the transaction list and safely + * removes and deallocates each handler in the list. The function also + * removes any asynchronous clients associated with the controller's + * channels and destroys the asynchronous ID pool. Finally, it resets + * the asynchronous ID pool and invoke function pointers to NULL. + * + * Return: 0 on success, -EINVAL if the controller is invalid or not + * initialized. + */ +static int stratix10_svc_async_exit(struct stratix10_svc_controller *ctrl) +{ + struct stratix10_svc_async_handler *handler; + struct stratix10_async_ctrl *actrl; + struct hlist_node *tmp; + int i; + + if (!ctrl) + return -EINVAL; + + actrl = &ctrl->actrl; + + if (!actrl->initialized) + return -EINVAL; + + actrl->initialized = false; + + scoped_guard(spinlock_bh, &actrl->trx_list_lock) { + hash_for_each_safe(actrl->trx_list, i, tmp, handler, next) { + ida_free(&handler->achan->job_id_pool, + STRATIX10_GET_JOBID(handler->transaction_id)); + hash_del(&handler->next); + kfree(handler); + } + } + + for (i = 0; i < SVC_NUM_CHANNEL; i++) { + if (ctrl->chans[i].async_chan) { + stratix10_svc_remove_async_client(&ctrl->chans[i]); + ctrl->chans[i].async_chan = NULL; + } + } + + ida_destroy(&actrl->async_id_pool); + actrl->invoke_fn = NULL; + + return 0; +} + /** * stratix10_svc_free_channel() - free service channel * @chan: service channel to be freed @@ -1197,11 +1834,18 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) controller->invoke_fn = invoke_fn; init_completion(&controller->complete_status); + ret = stratix10_svc_async_init(controller); + if (ret) { + dev_dbg(dev, "Intel Service Layer Driver: Error on stratix10_svc_async_init %d\n", + ret); + goto err_destroy_pool; + } + fifo_size = sizeof(struct stratix10_svc_data) * SVC_NUM_DATA_IN_FIFO; ret = kfifo_alloc(&controller->svc_fifo, fifo_size, GFP_KERNEL); if (ret) { dev_err(dev, "failed to allocate FIFO\n"); - goto err_destroy_pool; + goto err_async_exit; } spin_lock_init(&controller->svc_fifo_lock); @@ -1277,6 +1921,8 @@ err_unregister_rsu_dev: platform_device_unregister(svc->stratix10_svc_rsu); err_free_kfifo: kfifo_free(&controller->svc_fifo); +err_async_exit: + stratix10_svc_async_exit(controller); err_destroy_pool: gen_pool_destroy(genpool); return ret; @@ -1287,6 +1933,8 @@ static void stratix10_svc_drv_remove(struct platform_device *pdev) struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev); struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev); + stratix10_svc_async_exit(ctrl); + of_platform_depopulate(ctrl->dev); platform_device_unregister(svc->intel_svc_fcs); diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 7306dd243b2a..3995d5d70cce 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017-2018, Intel Corporation + * Copyright (C) 2025, Altera Corporation */ #ifndef __STRATIX10_SMC_H @@ -47,6 +48,10 @@ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ ARM_SMCCC_OWNER_SIP, (func_num)) +#define INTEL_SIP_SMC_ASYNC_VAL(func_name) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, (func_name)) + /** * Return values in INTEL_SIP_SMC_* call * @@ -654,4 +659,24 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_HWMON_READVOLT \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT) +/** + * Request INTEL_SIP_SMC_ASYNC_POLL + * Async call used by service driver at EL1 to query mailbox response from SDM. + * + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_POLL + * a1 transaction job id + * a2-17 will be used to return the response data + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1-17 will contain the response values from mailbox for the previous send + * transaction + * Or + * a0 INTEL_SIP_SMC_STATUS_NO_RESPONSE + * a1-17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8) +#define INTEL_SIP_SMC_ASYNC_POLL \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL) #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 520004a5f15d..532dd4bd76dd 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017-2018, Intel Corporation + * Copyright (C) 2025, Altera Corporation */ #ifndef __STRATIX10_SVC_CLIENT_H @@ -290,5 +291,92 @@ int stratix10_svc_send(struct stratix10_svc_chan *chan, void *msg); * request process. */ void stratix10_svc_done(struct stratix10_svc_chan *chan); + +/** + * typedef async_callback_t - A type definition for an asynchronous callback function. + * + * This type defines a function pointer for an asynchronous callback. + * The callback function takes a single argument, which is a pointer to + * user-defined data. + * + * @param cb_arg A pointer to user-defined data passed to the callback function. + */ +typedef void (*async_callback_t)(void *cb_arg); + +/** + * stratix10_svc_add_async_client - Add an asynchronous client to a Stratix 10 + * service channel. + * @chan: Pointer to the Stratix 10 service channel structure. + * @use_unique_clientid: Boolean flag indicating whether to use a unique client ID. + * + * This function registers an asynchronous client with the specified Stratix 10 + * service channel. If the use_unique_clientid flag is set to true, a unique client + * ID will be assigned to the client. + * + * Return: 0 on success, or a negative error code on failure: + * -EINVAL if the channel is NULL or the async controller is not initialized. + * -EALREADY if the async channel is already allocated. + * -ENOMEM if memory allocation fails. + * Other negative values if ID allocation fails + */ +int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan, bool use_unique_clientid); + +/** + * stratix10_svc_remove_async_client - Remove an asynchronous client from the Stratix 10 + * service channel. + * @chan: Pointer to the Stratix 10 service channel structure. + * + * This function removes an asynchronous client from the specified Stratix 10 service channel. + * It is typically used to clean up and release resources associated with the client. + * + * Return: 0 on success, -EINVAL if the channel or asynchronous channel is invalid. + */ +int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan); + +/** + * stratix10_svc_async_send - Send an asynchronous message to the SDM mailbox + * in EL3 secure firmware. + * @chan: Pointer to the service channel structure. + * @msg: Pointer to the message to be sent. + * @handler: Pointer to the handler object used by caller to track the transaction. + * @cb: Callback function to be called upon completion. + * @cb_arg: Argument to be passed to the callback function. + * + * This function sends a message asynchronously to the SDM mailbox in EL3 secure firmware. + * and registers a callback function to be invoked when the operation completes. + * + * Return: 0 on success,and negative error codes on failure. + */ +int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, void **handler, + async_callback_t cb, void *cb_arg); + +/** + * stratix10_svc_async_poll - Polls the status of an asynchronous service request. + * @chan: Pointer to the service channel structure. + * @tx_handle: Handle to the transaction being polled. + * @data: Pointer to the callback data structure to be filled with the result. + * + * This function checks the status of an asynchronous service request + * and fills the provided callback data structure with the result. + * + * Return: 0 on success, -EINVAL if any input parameter is invalid or if the + * async controller is not initialized, -EAGAIN if the transaction is + * still in progress, or other negative error codes on failure. + */ +int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, void *tx_handle, + struct stratix10_svc_cb_data *data); + +/** + * stratix10_svc_async_done - Complete an asynchronous transaction + * @chan: Pointer to the service channel structure + * @tx_handle: Pointer to the transaction handle + * + * This function completes an asynchronous transaction by removing the + * transaction from the hash table and deallocating the associated resources. + * + * Return: 0 on success, -EINVAL on invalid input or errors. + */ +int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle); + #endif -- cgit v1.2.3 From ec52379341a1209826c3e0ae53674393724d2071 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Mon, 27 Oct 2025 22:54:42 +0800 Subject: firmware: stratix10-svc: Add support for RSU commands in asynchronous framework Integrate Remote System Update(RSU) service commands into the asynchronous framework for communicating with SDM. This allows the RSU commands to be processed asynchronously, improving the responsiveness of the Stratix10 service channel. The asynchronous framework now supports the following RSU commands: * COMMAND_RSU_GET_SPT_TABLE * COMMAND_RSU_STATUS * COMMAND_RSU_NOTIFY Signed-off-by: Mahesh Rao Reviewed-by: Matthew Gerlach Signed-off-by: Dinh Nguyen --- drivers/firmware/stratix10-svc.c | 72 ++++++++++++++++++++++ include/linux/firmware/intel/stratix10-smc.h | 52 ++++++++++++++++ .../linux/firmware/intel/stratix10-svc-client.h | 4 ++ 3 files changed, 128 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index 14bfa36a58ed..3acfa067c5dd 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -90,6 +90,12 @@ #define STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(transaction_id) \ (FIELD_PREP(STRATIX10_TRANS_ID_FIELD, transaction_id)) +/* 10-bit mask for extracting the SDM status code */ +#define STRATIX10_SDM_STATUS_MASK GENMASK(9, 0) +/* Macro to get the SDM mailbox error status */ +#define STRATIX10_GET_SDM_STATUS_CODE(status) \ + (FIELD_GET(STRATIX10_SDM_STATUS_MASK, status)) + typedef void (svc_invoke_fn)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, @@ -1273,6 +1279,16 @@ int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, STRATIX10_SIP_SMC_SET_TRANSACTIONID_X1(handle->transaction_id); switch (p_msg->command) { + case COMMAND_RSU_GET_SPT_TABLE: + args.a0 = INTEL_SIP_SMC_ASYNC_RSU_GET_SPT; + break; + case COMMAND_RSU_STATUS: + args.a0 = INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS; + break; + case COMMAND_RSU_NOTIFY: + args.a0 = INTEL_SIP_SMC_ASYNC_RSU_NOTIFY; + args.a2 = p_msg->arg[0]; + break; default: dev_err(ctrl->dev, "Invalid command ,%d\n", p_msg->command); ret = -EINVAL; @@ -1326,6 +1342,56 @@ deallocate_id: return ret; } EXPORT_SYMBOL_GPL(stratix10_svc_async_send); + +/** + * stratix10_svc_async_prepare_response - Prepare the response data for + * an asynchronous transaction. + * @chan: Pointer to the service channel structure. + * @handle: Pointer to the asynchronous handler structure. + * @data: Pointer to the callback data structure. + * + * This function prepares the response data for an asynchronous transaction. It + * extracts the response data from the SMC response structure and stores it in + * the callback data structure. The function also logs the completion of the + * asynchronous transaction. + * + * Return: 0 on success, -ENOENT if the command is invalid + */ +static int stratix10_svc_async_prepare_response(struct stratix10_svc_chan *chan, + struct stratix10_svc_async_handler *handle, + struct stratix10_svc_cb_data *data) +{ + struct stratix10_svc_client_msg *p_msg = + (struct stratix10_svc_client_msg *)handle->msg; + struct stratix10_svc_controller *ctrl = chan->ctrl; + + data->status = STRATIX10_GET_SDM_STATUS_CODE(handle->res.a1); + + switch (p_msg->command) { + case COMMAND_RSU_NOTIFY: + break; + case COMMAND_RSU_GET_SPT_TABLE: + data->kaddr1 = (void *)&handle->res.a2; + data->kaddr2 = (void *)&handle->res.a3; + break; + case COMMAND_RSU_STATUS: + /* COMMAND_RSU_STATUS has more elements than the cb_data + * can acomodate, so passing the response structure to the + * response function to be handled before done command is + * executed by the client. + */ + data->kaddr1 = (void *)&handle->res; + break; + + default: + dev_alert(ctrl->dev, "Invalid command\n ,%d", p_msg->command); + return -ENOENT; + } + dev_dbg(ctrl->dev, "Async message completed transaction_id 0x%02x\n", + handle->transaction_id); + return 0; +} + /** * stratix10_svc_async_poll - Polls the status of an asynchronous * transaction. @@ -1355,6 +1421,7 @@ int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, struct stratix10_svc_controller *ctrl; struct stratix10_async_ctrl *actrl; struct stratix10_async_chan *achan; + int ret; if (!chan || !tx_handle || !data) return -EINVAL; @@ -1386,6 +1453,11 @@ int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, memset(data, 0, sizeof(*data)); if (handle->res.a0 == INTEL_SIP_SMC_STATUS_OK) { + ret = stratix10_svc_async_prepare_response(chan, handle, data); + if (ret) { + dev_err(ctrl->dev, "Error in preparation of response,%d\n", ret); + WARN_ON_ONCE(1); + } return 0; } else if (handle->res.a0 == INTEL_SIP_SMC_STATUS_BUSY) { dev_dbg(ctrl->dev, "async message is still in progress\n"); diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 3995d5d70cce..935dba3633b5 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -679,4 +679,56 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8) #define INTEL_SIP_SMC_ASYNC_POLL \ INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_GET_SPT + * Async call to get RSU SPT from SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_SPT + * a1 transaction job id + * a2-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT (0xEA) +#define INTEL_SIP_SMC_ASYNC_RSU_GET_SPT \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS + * Async call to get RSU error status from SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS + * a1 transaction job id + * a2-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS (0xEB) +#define INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_NOTIFY + * Async call to send NOTIFY value to SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_NOTIFY + * a1 transaction job id + * a2 notify value + * a3-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY (0xEC) +#define INTEL_SIP_SMC_ASYNC_RSU_NOTIFY \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY) #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 532dd4bd76dd..1bcc56d14080 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -128,6 +128,9 @@ struct stratix10_svc_chan; * @COMMAND_RSU_DCMF_STATUS: query firmware for the DCMF status * return status is SVC_STATUS_OK or SVC_STATUS_ERROR * + * @COMMAND_RSU_GET_SPT_TABLE: query firmware for SPT table + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR + * * @COMMAND_FCS_REQUEST_SERVICE: request validation of image from firmware, * return status is SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM * @@ -162,6 +165,7 @@ enum stratix10_svc_command_code { COMMAND_RSU_DCMF_VERSION, COMMAND_RSU_DCMF_STATUS, COMMAND_FIRMWARE_VERSION, + COMMAND_RSU_GET_SPT_TABLE, /* for FCS */ COMMAND_FCS_REQUEST_SERVICE = 20, COMMAND_FCS_SEND_CERTIFICATE, -- cgit v1.2.3 From 36640d21fdfe0152c96e6cb9b58e3336291dfbaa Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 29 Oct 2025 13:34:49 +0530 Subject: PCI: Export pci_get_host_bridge_device() for use by pci-keystone The pci-keystone.c driver uses the 'pci_get_host_bridge_device()' helper. Export it in preparation for enabling the pci-keystone.c driver to be built as a loadable module. Signed-off-by: Siddharth Vadapalli Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251029080547.1253757-2-s-vadapalli@ti.com --- drivers/pci/host-bridge.c | 1 + include/linux/pci.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index afa50b446567..be5ef6516cff 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -33,6 +33,7 @@ struct device *pci_get_host_bridge_device(struct pci_dev *dev) kobject_get(&bridge->kobj); return bridge; } +EXPORT_SYMBOL_GPL(pci_get_host_bridge_device); void pci_put_host_bridge_device(struct device *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..b253cbc27d36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -646,6 +646,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv); struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev, size_t priv); void pci_free_host_bridge(struct pci_host_bridge *bridge); +struct device *pci_get_host_bridge_device(struct pci_dev *dev); struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); void pci_set_host_bridge_release(struct pci_host_bridge *bridge, -- cgit v1.2.3 From e5b5f8b7c26f72fe86b59979e51d8e6cf36ea903 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:40 -0800 Subject: PCI/TSM: Drop stub for pci_tsm_doe_transfer() Just like pci_tsm_pf0_{con,de}structor(), in the CONFIG_PCI_TSM=n case there should be no callers of pci_tsm_doe_transfer(). Reported-by: Xu Yilun Closes: http://lore.kernel.org/aRFfk14DJWEVhC/R@yilunxu-OptiPlex-7050 Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-3-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-tsm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index e921d30f9b6c..d7b078d5e272 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -147,11 +147,5 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev) static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) { } -static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, - const void *req, size_t req_sz, - void *resp, size_t resp_sz) -{ - return -ENXIO; -} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From c16af019d9d6d23f211c82b5561f2ecd2a7dff54 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:41 -0800 Subject: resource: Introduce resource_assigned() for discerning active resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PCI bridge resource lifecycle involves both a "request" and "assign" phase. At any point in time that resource may not yet be assigned, or may have failed to assign (because it does not fit). There are multiple conventions to determine when assignment has not completed: IORESOURCE_UNSET, IORESOURCE_DISABLED, and checking whether the resource is parented. In code paths that are known to not be racing assignment, e.g. post subsys_initcall(), the most reliable method to judge that a bridge resource is assigned is to check the resource is parented [1]. Introduce a resource_assigned() helper for this purpose. Link: http://lore.kernel.org/2b9f7f7b-d6a4-be59-14d4-7b4ffccfe373@linux.intel.com [1] Suggested-by: Ilpo Järvinen Cc: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-4-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/ioport.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index e8b2d6aa4013..9afa30f9346f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -334,6 +334,15 @@ static inline bool resource_union(const struct resource *r1, const struct resour return true; } +/* + * Check if this resource is added to a resource tree or detached. Caller is + * responsible for not racing assignment. + */ +static inline bool resource_assigned(struct resource *res) +{ + return res->parent; +} + int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint); -- cgit v1.2.3 From a97fbc3ee3e2a536fafaff04f21f45472db71769 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 29 Oct 2025 17:33:30 +0100 Subject: syscore: Pass context data to callbacks Several drivers can benefit from registering per-instance data along with the syscore operations. To achieve this, move the modifiable fields out of the syscore_ops structure and into a separate struct syscore that can be registered with the framework. Add a void * driver data field for drivers to store contextual data that will be passed to the syscore ops. Acked-by: Rafael J. Wysocki (Intel) Signed-off-by: Thierry Reding --- arch/arm/mach-exynos/mcpm-exynos.c | 12 +++-- arch/arm/mach-exynos/suspend.c | 48 +++++++++++------- arch/arm/mach-pxa/generic.h | 6 +-- arch/arm/mach-pxa/irq.c | 10 ++-- arch/arm/mach-pxa/mfp-pxa2xx.c | 10 ++-- arch/arm/mach-pxa/mfp-pxa3xx.c | 10 ++-- arch/arm/mach-pxa/pxa25x.c | 4 +- arch/arm/mach-pxa/pxa27x.c | 4 +- arch/arm/mach-pxa/pxa3xx.c | 4 +- arch/arm/mach-pxa/smemc.c | 12 +++-- arch/arm/mach-s3c/irq-pm-s3c64xx.c | 12 +++-- arch/arm/mach-s5pv210/pm.c | 10 ++-- arch/arm/mach-versatile/integrator_ap.c | 12 +++-- arch/arm/mm/cache-b15-rac.c | 12 +++-- arch/loongarch/kernel/smp.c | 12 +++-- arch/mips/alchemy/common/dbdma.c | 12 +++-- arch/mips/alchemy/common/irq.c | 24 ++++++--- arch/mips/alchemy/common/usb.c | 12 +++-- arch/mips/pci/pci-alchemy.c | 16 +++--- arch/powerpc/platforms/cell/spu_base.c | 10 ++-- arch/powerpc/platforms/powermac/pic.c | 12 +++-- arch/powerpc/sysdev/fsl_lbc.c | 12 +++-- arch/powerpc/sysdev/fsl_pci.c | 12 +++-- arch/powerpc/sysdev/ipic.c | 12 +++-- arch/powerpc/sysdev/mpic.c | 14 ++++-- arch/powerpc/sysdev/mpic_timer.c | 10 ++-- arch/sh/mm/pmb.c | 10 ++-- arch/x86/events/amd/ibs.c | 12 +++-- arch/x86/hyperv/hv_init.c | 12 +++-- arch/x86/kernel/amd_gart_64.c | 10 ++-- arch/x86/kernel/apic/apic.c | 12 +++-- arch/x86/kernel/apic/io_apic.c | 17 +++++-- arch/x86/kernel/cpu/aperfmperf.c | 20 +++++--- arch/x86/kernel/cpu/intel_epb.c | 16 +++--- arch/x86/kernel/cpu/mce/core.c | 14 ++++-- arch/x86/kernel/cpu/microcode/core.c | 15 ++++-- arch/x86/kernel/cpu/mtrr/legacy.c | 12 +++-- arch/x86/kernel/cpu/umwait.c | 10 ++-- arch/x86/kernel/i8237.c | 10 ++-- arch/x86/kernel/i8259.c | 14 ++++-- arch/x86/kernel/kvm.c | 12 +++-- drivers/acpi/pci_link.c | 10 ++-- drivers/acpi/sleep.c | 12 +++-- drivers/base/firmware_loader/main.c | 12 +++-- drivers/base/syscore.c | 82 ++++++++++++++++--------------- drivers/bus/mvebu-mbus.c | 16 +++--- drivers/clk/at91/pmc.c | 12 +++-- drivers/clk/imx/clk-vf610.c | 12 +++-- drivers/clk/ingenic/jz4725b-cgu.c | 2 +- drivers/clk/ingenic/jz4740-cgu.c | 2 +- drivers/clk/ingenic/jz4755-cgu.c | 2 +- drivers/clk/ingenic/jz4760-cgu.c | 2 +- drivers/clk/ingenic/jz4770-cgu.c | 2 +- drivers/clk/ingenic/jz4780-cgu.c | 2 +- drivers/clk/ingenic/pm.c | 14 ++++-- drivers/clk/ingenic/pm.h | 2 +- drivers/clk/ingenic/tcu.c | 12 +++-- drivers/clk/ingenic/x1000-cgu.c | 2 +- drivers/clk/ingenic/x1830-cgu.c | 2 +- drivers/clk/mvebu/common.c | 12 +++-- drivers/clk/rockchip/clk-rk3288.c | 12 +++-- drivers/clk/samsung/clk-s5pv210-audss.c | 12 +++-- drivers/clk/samsung/clk.c | 12 +++-- drivers/clk/tegra/clk-tegra210.c | 12 +++-- drivers/clocksource/timer-armada-370-xp.c | 12 +++-- drivers/cpuidle/cpuidle-psci.c | 12 +++-- drivers/gpio/gpio-mxc.c | 12 +++-- drivers/gpio/gpio-pxa.c | 12 +++-- drivers/gpio/gpio-sa1100.c | 12 +++-- drivers/hv/vmbus_drv.c | 14 ++++-- drivers/iommu/amd/init.c | 16 +++--- drivers/iommu/intel/iommu.c | 12 +++-- drivers/irqchip/exynos-combiner.c | 14 ++++-- drivers/irqchip/irq-armada-370-xp.c | 12 +++-- drivers/irqchip/irq-bcm7038-l1.c | 12 +++-- drivers/irqchip/irq-gic-v3-its.c | 12 +++-- drivers/irqchip/irq-i8259.c | 12 +++-- drivers/irqchip/irq-imx-gpcv2.c | 16 +++--- drivers/irqchip/irq-loongson-eiointc.c | 12 +++-- drivers/irqchip/irq-loongson-htpic.c | 10 ++-- drivers/irqchip/irq-loongson-htvec.c | 12 +++-- drivers/irqchip/irq-loongson-pch-lpc.c | 12 +++-- drivers/irqchip/irq-loongson-pch-pic.c | 12 +++-- drivers/irqchip/irq-mchp-eic.c | 12 +++-- drivers/irqchip/irq-mst-intc.c | 12 +++-- drivers/irqchip/irq-mtk-cirq.c | 12 +++-- drivers/irqchip/irq-renesas-rzg2l.c | 12 +++-- drivers/irqchip/irq-sa11x0.c | 12 +++-- drivers/irqchip/irq-sifive-plic.c | 12 +++-- drivers/irqchip/irq-sun6i-r.c | 18 ++++--- drivers/irqchip/irq-tegra.c | 12 +++-- drivers/irqchip/irq-vic.c | 12 +++-- drivers/leds/trigger/ledtrig-cpu.c | 14 ++++-- drivers/macintosh/via-pmu.c | 12 +++-- drivers/power/reset/sc27xx-poweroff.c | 10 ++-- drivers/sh/clk/core.c | 10 ++-- drivers/sh/intc/core.c | 12 +++-- drivers/soc/bcm/brcmstb/biuctrl.c | 12 +++-- drivers/soc/tegra/pmc.c | 17 ++++--- drivers/thermal/intel/intel_hfi.c | 12 +++-- drivers/xen/xen-acpi-processor.c | 12 +++-- include/linux/syscore_ops.h | 15 ++++-- kernel/cpu_pm.c | 12 +++-- kernel/irq/generic-chip.c | 14 ++++-- kernel/irq/pm.c | 11 +++-- kernel/printk/printk.c | 11 +++-- kernel/time/sched_clock.c | 22 +++++++-- kernel/time/timekeeping.c | 22 +++++++-- virt/kvm/kvm_main.c | 18 ++++--- 109 files changed, 898 insertions(+), 470 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-exynos/mcpm-exynos.c b/arch/arm/mach-exynos/mcpm-exynos.c index fd0dbeb93357..cb7d8a7b14e0 100644 --- a/arch/arm/mach-exynos/mcpm-exynos.c +++ b/arch/arm/mach-exynos/mcpm-exynos.c @@ -215,7 +215,7 @@ static const struct of_device_id exynos_dt_mcpm_match[] = { {}, }; -static void exynos_mcpm_setup_entry_point(void) +static void exynos_mcpm_setup_entry_point(void *data) { /* * U-Boot SPL is hardcoded to jump to the start of ns_sram_base_addr @@ -228,10 +228,14 @@ static void exynos_mcpm_setup_entry_point(void) __raw_writel(__pa_symbol(mcpm_entry_point), ns_sram_base_addr + 8); } -static struct syscore_ops exynos_mcpm_syscore_ops = { +static const struct syscore_ops exynos_mcpm_syscore_ops = { .resume = exynos_mcpm_setup_entry_point, }; +static struct syscore exynos_mcpm_syscore = { + .ops = &exynos_mcpm_syscore_ops, +}; + static int __init exynos_mcpm_init(void) { struct device_node *node; @@ -300,9 +304,9 @@ static int __init exynos_mcpm_init(void) pmu_raw_writel(value, EXYNOS_COMMON_OPTION(i)); } - exynos_mcpm_setup_entry_point(); + exynos_mcpm_setup_entry_point(NULL); - register_syscore_ops(&exynos_mcpm_syscore_ops); + register_syscore(&exynos_mcpm_syscore); return ret; } diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c index 150a1e56dcae..22d723553f62 100644 --- a/arch/arm/mach-exynos/suspend.c +++ b/arch/arm/mach-exynos/suspend.c @@ -53,9 +53,9 @@ struct exynos_pm_data { void (*pm_prepare)(void); void (*pm_resume_prepare)(void); - void (*pm_resume)(void); - int (*pm_suspend)(void); int (*cpu_suspend)(unsigned long); + + const struct syscore_ops *syscore_ops; }; /* Used only on Exynos542x/5800 */ @@ -376,7 +376,7 @@ static void exynos5420_pm_prepare(void) } -static int exynos_pm_suspend(void) +static int exynos_pm_suspend(void *data) { exynos_pm_central_suspend(); @@ -390,7 +390,7 @@ static int exynos_pm_suspend(void) return 0; } -static int exynos5420_pm_suspend(void) +static int exynos5420_pm_suspend(void *data) { u32 this_cluster; @@ -408,7 +408,7 @@ static int exynos5420_pm_suspend(void) return 0; } -static void exynos_pm_resume(void) +static void exynos_pm_resume(void *data) { u32 cpuid = read_cpuid_part(); @@ -429,7 +429,7 @@ early_wakeup: exynos_set_delayed_reset_assertion(true); } -static void exynos3250_pm_resume(void) +static void exynos3250_pm_resume(void *data) { u32 cpuid = read_cpuid_part(); @@ -473,7 +473,7 @@ static void exynos5420_prepare_pm_resume(void) } } -static void exynos5420_pm_resume(void) +static void exynos5420_pm_resume(void *data) { unsigned long tmp; @@ -596,41 +596,52 @@ static const struct platform_suspend_ops exynos_suspend_ops = { .valid = suspend_valid_only_mem, }; +static const struct syscore_ops exynos3250_syscore_ops = { + .suspend = exynos_pm_suspend, + .resume = exynos3250_pm_resume, +}; + static const struct exynos_pm_data exynos3250_pm_data = { .wkup_irq = exynos3250_wkup_irq, .wake_disable_mask = ((0xFF << 8) | (0x1F << 1)), - .pm_suspend = exynos_pm_suspend, - .pm_resume = exynos3250_pm_resume, .pm_prepare = exynos3250_pm_prepare, .cpu_suspend = exynos3250_cpu_suspend, + .syscore_ops = &exynos3250_syscore_ops, +}; + +static const struct syscore_ops exynos_syscore_ops = { + .suspend = exynos_pm_suspend, + .resume = exynos_pm_resume, }; static const struct exynos_pm_data exynos4_pm_data = { .wkup_irq = exynos4_wkup_irq, .wake_disable_mask = ((0xFF << 8) | (0x1F << 1)), - .pm_suspend = exynos_pm_suspend, - .pm_resume = exynos_pm_resume, .pm_prepare = exynos_pm_prepare, .cpu_suspend = exynos_cpu_suspend, + .syscore_ops = &exynos_syscore_ops, }; static const struct exynos_pm_data exynos5250_pm_data = { .wkup_irq = exynos5250_wkup_irq, .wake_disable_mask = ((0xFF << 8) | (0x1F << 1)), - .pm_suspend = exynos_pm_suspend, - .pm_resume = exynos_pm_resume, .pm_prepare = exynos_pm_prepare, .cpu_suspend = exynos_cpu_suspend, + .syscore_ops = &exynos_syscore_ops, +}; + +static const struct syscore_ops exynos5420_syscore_ops = { + .resume = exynos5420_pm_resume, + .suspend = exynos5420_pm_suspend, }; static const struct exynos_pm_data exynos5420_pm_data = { .wkup_irq = exynos5250_wkup_irq, .wake_disable_mask = (0x7F << 7) | (0x1F << 1), .pm_resume_prepare = exynos5420_prepare_pm_resume, - .pm_resume = exynos5420_pm_resume, - .pm_suspend = exynos5420_pm_suspend, .pm_prepare = exynos5420_pm_prepare, .cpu_suspend = exynos5420_cpu_suspend, + .syscore_ops = &exynos5420_syscore_ops, }; static const struct of_device_id exynos_pmu_of_device_ids[] __initconst = { @@ -656,7 +667,7 @@ static const struct of_device_id exynos_pmu_of_device_ids[] __initconst = { { /*sentinel*/ }, }; -static struct syscore_ops exynos_pm_syscore_ops; +static struct syscore exynos_pm_syscore; void __init exynos_pm_init(void) { @@ -684,10 +695,9 @@ void __init exynos_pm_init(void) tmp |= pm_data->wake_disable_mask; pmu_raw_writel(tmp, S5P_WAKEUP_MASK); - exynos_pm_syscore_ops.suspend = pm_data->pm_suspend; - exynos_pm_syscore_ops.resume = pm_data->pm_resume; + exynos_pm_syscore.ops = pm_data->syscore_ops; - register_syscore_ops(&exynos_pm_syscore_ops); + register_syscore(&exynos_pm_syscore); suspend_set_ops(&exynos_suspend_ops); /* diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h index c9c2c46ecead..caad4fca8de3 100644 --- a/arch/arm/mach-pxa/generic.h +++ b/arch/arm/mach-pxa/generic.h @@ -34,9 +34,9 @@ extern void __init pxa27x_map_io(void); extern void __init pxa3xx_init_irq(void); extern void __init pxa3xx_map_io(void); -extern struct syscore_ops pxa_irq_syscore_ops; -extern struct syscore_ops pxa2xx_mfp_syscore_ops; -extern struct syscore_ops pxa3xx_mfp_syscore_ops; +extern struct syscore pxa_irq_syscore; +extern struct syscore pxa2xx_mfp_syscore; +extern struct syscore pxa3xx_mfp_syscore; void __init pxa_set_ffuart_info(void *info); void __init pxa_set_btuart_info(void *info); diff --git a/arch/arm/mach-pxa/irq.c b/arch/arm/mach-pxa/irq.c index 5bfce8aa4102..99acebbbf065 100644 --- a/arch/arm/mach-pxa/irq.c +++ b/arch/arm/mach-pxa/irq.c @@ -178,7 +178,7 @@ void __init pxa_init_irq(int irq_nr, int (*fn)(struct irq_data *, unsigned int)) static unsigned long saved_icmr[MAX_INTERNAL_IRQS/32]; static unsigned long saved_ipr[MAX_INTERNAL_IRQS]; -static int pxa_irq_suspend(void) +static int pxa_irq_suspend(void *data) { int i; @@ -197,7 +197,7 @@ static int pxa_irq_suspend(void) return 0; } -static void pxa_irq_resume(void) +static void pxa_irq_resume(void *data) { int i; @@ -219,11 +219,15 @@ static void pxa_irq_resume(void) #define pxa_irq_resume NULL #endif -struct syscore_ops pxa_irq_syscore_ops = { +static const struct syscore_ops pxa_irq_syscore_ops = { .suspend = pxa_irq_suspend, .resume = pxa_irq_resume, }; +struct syscore pxa_irq_syscore = { + .ops = &pxa_irq_syscore_ops, +}; + #ifdef CONFIG_OF static const struct of_device_id intc_ids[] __initconst = { { .compatible = "marvell,pxa-intc", }, diff --git a/arch/arm/mach-pxa/mfp-pxa2xx.c b/arch/arm/mach-pxa/mfp-pxa2xx.c index f5a3d890f682..d1347055fbe4 100644 --- a/arch/arm/mach-pxa/mfp-pxa2xx.c +++ b/arch/arm/mach-pxa/mfp-pxa2xx.c @@ -346,7 +346,7 @@ static unsigned long saved_gpdr[4]; static unsigned long saved_gplr[4]; static unsigned long saved_pgsr[4]; -static int pxa2xx_mfp_suspend(void) +static int pxa2xx_mfp_suspend(void *data) { int i; @@ -385,7 +385,7 @@ static int pxa2xx_mfp_suspend(void) return 0; } -static void pxa2xx_mfp_resume(void) +static void pxa2xx_mfp_resume(void *data) { int i; @@ -404,11 +404,15 @@ static void pxa2xx_mfp_resume(void) #define pxa2xx_mfp_resume NULL #endif -struct syscore_ops pxa2xx_mfp_syscore_ops = { +static const struct syscore_ops pxa2xx_mfp_syscore_ops = { .suspend = pxa2xx_mfp_suspend, .resume = pxa2xx_mfp_resume, }; +struct syscore pxa2xx_mfp_syscore = { + .ops = &pxa2xx_mfp_syscore_ops, +}; + static int __init pxa2xx_mfp_init(void) { int i; diff --git a/arch/arm/mach-pxa/mfp-pxa3xx.c b/arch/arm/mach-pxa/mfp-pxa3xx.c index d16ab7451efe..fe7498fbb62b 100644 --- a/arch/arm/mach-pxa/mfp-pxa3xx.c +++ b/arch/arm/mach-pxa/mfp-pxa3xx.c @@ -27,13 +27,13 @@ * a pull-down mode if they're an active low chip select, and we're * just entering standby. */ -static int pxa3xx_mfp_suspend(void) +static int pxa3xx_mfp_suspend(void *data) { mfp_config_lpm(); return 0; } -static void pxa3xx_mfp_resume(void) +static void pxa3xx_mfp_resume(void *data) { mfp_config_run(); @@ -49,7 +49,11 @@ static void pxa3xx_mfp_resume(void) #define pxa3xx_mfp_resume NULL #endif -struct syscore_ops pxa3xx_mfp_syscore_ops = { +static const struct syscore_ops pxa3xx_mfp_syscore_ops = { .suspend = pxa3xx_mfp_suspend, .resume = pxa3xx_mfp_resume, }; + +struct syscore pxa3xx_mfp_syscore = { + .ops = &pxa3xx_mfp_syscore_ops, +}; diff --git a/arch/arm/mach-pxa/pxa25x.c b/arch/arm/mach-pxa/pxa25x.c index 03e34841fc00..70509a599814 100644 --- a/arch/arm/mach-pxa/pxa25x.c +++ b/arch/arm/mach-pxa/pxa25x.c @@ -235,8 +235,8 @@ static int __init pxa25x_init(void) pxa25x_init_pm(); - register_syscore_ops(&pxa_irq_syscore_ops); - register_syscore_ops(&pxa2xx_mfp_syscore_ops); + register_syscore(&pxa_irq_syscore); + register_syscore(&pxa2xx_mfp_syscore); if (!of_have_populated_dt()) { software_node_register(&pxa2xx_gpiochip_node); diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c index f8382477d629..ff6361979038 100644 --- a/arch/arm/mach-pxa/pxa27x.c +++ b/arch/arm/mach-pxa/pxa27x.c @@ -337,8 +337,8 @@ static int __init pxa27x_init(void) pxa27x_init_pm(); - register_syscore_ops(&pxa_irq_syscore_ops); - register_syscore_ops(&pxa2xx_mfp_syscore_ops); + register_syscore(&pxa_irq_syscore); + register_syscore(&pxa2xx_mfp_syscore); if (!of_have_populated_dt()) { software_node_register(&pxa2xx_gpiochip_node); diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c index 1d1e5713464d..06c578ea658e 100644 --- a/arch/arm/mach-pxa/pxa3xx.c +++ b/arch/arm/mach-pxa/pxa3xx.c @@ -424,8 +424,8 @@ static int __init pxa3xx_init(void) if (cpu_is_pxa320()) enable_irq_wake(IRQ_WAKEUP1); - register_syscore_ops(&pxa_irq_syscore_ops); - register_syscore_ops(&pxa3xx_mfp_syscore_ops); + register_syscore(&pxa_irq_syscore); + register_syscore(&pxa3xx_mfp_syscore); } return ret; diff --git a/arch/arm/mach-pxa/smemc.c b/arch/arm/mach-pxa/smemc.c index 2d2a321d82f8..fb93a8f28356 100644 --- a/arch/arm/mach-pxa/smemc.c +++ b/arch/arm/mach-pxa/smemc.c @@ -18,7 +18,7 @@ static unsigned long msc[2]; static unsigned long sxcnfg, memclkcfg; static unsigned long csadrcfg[4]; -static int pxa3xx_smemc_suspend(void) +static int pxa3xx_smemc_suspend(void *data) { msc[0] = __raw_readl(MSC0); msc[1] = __raw_readl(MSC1); @@ -32,7 +32,7 @@ static int pxa3xx_smemc_suspend(void) return 0; } -static void pxa3xx_smemc_resume(void) +static void pxa3xx_smemc_resume(void *data) { __raw_writel(msc[0], MSC0); __raw_writel(msc[1], MSC1); @@ -46,11 +46,15 @@ static void pxa3xx_smemc_resume(void) __raw_writel(0x2, CSMSADRCFG); } -static struct syscore_ops smemc_syscore_ops = { +static const struct syscore_ops smemc_syscore_ops = { .suspend = pxa3xx_smemc_suspend, .resume = pxa3xx_smemc_resume, }; +static struct syscore smemc_syscore = { + .ops = &smemc_syscore_ops, +}; + static int __init smemc_init(void) { if (cpu_is_pxa3xx()) { @@ -64,7 +68,7 @@ static int __init smemc_init(void) */ __raw_writel(0x2, CSMSADRCFG); - register_syscore_ops(&smemc_syscore_ops); + register_syscore(&smemc_syscore); } return 0; diff --git a/arch/arm/mach-s3c/irq-pm-s3c64xx.c b/arch/arm/mach-s3c/irq-pm-s3c64xx.c index 4a1e935bada1..ab726c595001 100644 --- a/arch/arm/mach-s3c/irq-pm-s3c64xx.c +++ b/arch/arm/mach-s3c/irq-pm-s3c64xx.c @@ -58,7 +58,7 @@ static struct irq_grp_save { static u32 irq_uart_mask[SERIAL_SAMSUNG_UARTS]; -static int s3c64xx_irq_pm_suspend(void) +static int s3c64xx_irq_pm_suspend(void *data) { struct irq_grp_save *grp = eint_grp_save; int i; @@ -79,7 +79,7 @@ static int s3c64xx_irq_pm_suspend(void) return 0; } -static void s3c64xx_irq_pm_resume(void) +static void s3c64xx_irq_pm_resume(void *data) { struct irq_grp_save *grp = eint_grp_save; int i; @@ -100,18 +100,22 @@ static void s3c64xx_irq_pm_resume(void) S3C_PMDBG("%s: IRQ configuration restored\n", __func__); } -static struct syscore_ops s3c64xx_irq_syscore_ops = { +static const struct syscore_ops s3c64xx_irq_syscore_ops = { .suspend = s3c64xx_irq_pm_suspend, .resume = s3c64xx_irq_pm_resume, }; +static struct syscore s3c64xx_irq_syscore = { + .ops = &s3c64xx_irq_syscore_ops, +}; + static __init int s3c64xx_syscore_init(void) { /* Appropriate drivers (pinctrl, uart) handle this when using DT. */ if (of_have_populated_dt() || !soc_is_s3c64xx()) return 0; - register_syscore_ops(&s3c64xx_irq_syscore_ops); + register_syscore(&s3c64xx_irq_syscore); return 0; } diff --git a/arch/arm/mach-s5pv210/pm.c b/arch/arm/mach-s5pv210/pm.c index 6fa70f787df4..fa270750364c 100644 --- a/arch/arm/mach-s5pv210/pm.c +++ b/arch/arm/mach-s5pv210/pm.c @@ -195,20 +195,24 @@ static const struct platform_suspend_ops s5pv210_suspend_ops = { /* * Syscore operations used to delay restore of certain registers. */ -static void s5pv210_pm_resume(void) +static void s5pv210_pm_resume(void *data) { s3c_pm_do_restore_core(s5pv210_core_save, ARRAY_SIZE(s5pv210_core_save)); } -static struct syscore_ops s5pv210_pm_syscore_ops = { +static const struct syscore_ops s5pv210_pm_syscore_ops = { .resume = s5pv210_pm_resume, }; +static struct syscore s5pv210_pm_syscore = { + .ops = &s5pv210_pm_syscore_ops, +}; + /* * Initialization entry point. */ void __init s5pv210_pm_init(void) { - register_syscore_ops(&s5pv210_pm_syscore_ops); + register_syscore(&s5pv210_pm_syscore); suspend_set_ops(&s5pv210_suspend_ops); } diff --git a/arch/arm/mach-versatile/integrator_ap.c b/arch/arm/mach-versatile/integrator_ap.c index 4bd6712e9f52..ee90d6619d0d 100644 --- a/arch/arm/mach-versatile/integrator_ap.c +++ b/arch/arm/mach-versatile/integrator_ap.c @@ -63,13 +63,13 @@ static void __init ap_map_io(void) #ifdef CONFIG_PM static unsigned long ic_irq_enable; -static int irq_suspend(void) +static int irq_suspend(void *data) { ic_irq_enable = readl(VA_IC_BASE + IRQ_ENABLE); return 0; } -static void irq_resume(void) +static void irq_resume(void *data) { /* disable all irq sources */ cm_clear_irqs(); @@ -83,14 +83,18 @@ static void irq_resume(void) #define irq_resume NULL #endif -static struct syscore_ops irq_syscore_ops = { +static const struct syscore_ops irq_syscore_ops = { .suspend = irq_suspend, .resume = irq_resume, }; +static struct syscore irq_syscore = { + .ops = &irq_syscore_ops, +}; + static int __init irq_syscore_init(void) { - register_syscore_ops(&irq_syscore_ops); + register_syscore(&irq_syscore); return 0; } diff --git a/arch/arm/mm/cache-b15-rac.c b/arch/arm/mm/cache-b15-rac.c index 6f63b90f9e1a..e7807356dfab 100644 --- a/arch/arm/mm/cache-b15-rac.c +++ b/arch/arm/mm/cache-b15-rac.c @@ -256,7 +256,7 @@ static int b15_rac_dead_cpu(unsigned int cpu) return 0; } -static int b15_rac_suspend(void) +static int b15_rac_suspend(void *data) { /* Suspend the read-ahead cache oeprations, forcing our cache * implementation to fallback to the regular ARMv7 calls. @@ -271,7 +271,7 @@ static int b15_rac_suspend(void) return 0; } -static void b15_rac_resume(void) +static void b15_rac_resume(void *data) { /* Coming out of a S3 suspend/resume cycle, the read-ahead cache * register RAC_CONFIG0_REG will be restored to its default value, make @@ -282,11 +282,15 @@ static void b15_rac_resume(void) clear_bit(RAC_SUSPENDED, &b15_rac_flags); } -static struct syscore_ops b15_rac_syscore_ops = { +static const struct syscore_ops b15_rac_syscore_ops = { .suspend = b15_rac_suspend, .resume = b15_rac_resume, }; +static struct syscore b15_rac_syscore = { + .ops = &b15_rac_syscore_ops, +}; + static int __init b15_rac_init(void) { struct device_node *dn, *cpu_dn; @@ -347,7 +351,7 @@ static int __init b15_rac_init(void) } if (IS_ENABLED(CONFIG_PM_SLEEP)) - register_syscore_ops(&b15_rac_syscore_ops); + register_syscore(&b15_rac_syscore); spin_lock(&rac_lock); reg = __raw_readl(b15_rac_base + RAC_CONFIG0_REG); diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 46036d98da75..8b2fcb3fb874 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -535,28 +535,32 @@ int hibernate_resume_nonboot_cpu_disable(void) */ #ifdef CONFIG_PM -static int loongson_ipi_suspend(void) +static int loongson_ipi_suspend(void *data) { return 0; } -static void loongson_ipi_resume(void) +static void loongson_ipi_resume(void *data) { iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_EN); } -static struct syscore_ops loongson_ipi_syscore_ops = { +static const struct syscore_ops loongson_ipi_syscore_ops = { .resume = loongson_ipi_resume, .suspend = loongson_ipi_suspend, }; +static struct syscore loongson_ipi_syscore = { + .ops = &loongson_ipi_syscore_ops, +}; + /* * Enable boot cpu ipi before enabling nonboot cpus * during syscore_resume. */ static int __init ipi_pm_init(void) { - register_syscore_ops(&loongson_ipi_syscore_ops); + register_syscore(&loongson_ipi_syscore); return 0; } diff --git a/arch/mips/alchemy/common/dbdma.c b/arch/mips/alchemy/common/dbdma.c index 6a3c890f7bbf..6c2c2010bbae 100644 --- a/arch/mips/alchemy/common/dbdma.c +++ b/arch/mips/alchemy/common/dbdma.c @@ -982,7 +982,7 @@ u32 au1xxx_dbdma_put_dscr(u32 chanid, au1x_ddma_desc_t *dscr) static unsigned long alchemy_dbdma_pm_data[NUM_DBDMA_CHANS + 1][6]; -static int alchemy_dbdma_suspend(void) +static int alchemy_dbdma_suspend(void *data) { int i; void __iomem *addr; @@ -1019,7 +1019,7 @@ static int alchemy_dbdma_suspend(void) return 0; } -static void alchemy_dbdma_resume(void) +static void alchemy_dbdma_resume(void *data) { int i; void __iomem *addr; @@ -1044,11 +1044,15 @@ static void alchemy_dbdma_resume(void) } } -static struct syscore_ops alchemy_dbdma_syscore_ops = { +static const struct syscore_ops alchemy_dbdma_syscore_ops = { .suspend = alchemy_dbdma_suspend, .resume = alchemy_dbdma_resume, }; +static struct syscore alchemy_dbdma_syscore = { + .ops = &alchemy_dbdma_syscore_ops, +}; + static int __init dbdma_setup(unsigned int irq, dbdev_tab_t *idtable) { int ret; @@ -1071,7 +1075,7 @@ static int __init dbdma_setup(unsigned int irq, dbdev_tab_t *idtable) printk(KERN_ERR "Cannot grab DBDMA interrupt!\n"); else { dbdma_initialized = 1; - register_syscore_ops(&alchemy_dbdma_syscore_ops); + register_syscore(&alchemy_dbdma_syscore); } return ret; diff --git a/arch/mips/alchemy/common/irq.c b/arch/mips/alchemy/common/irq.c index da9f9220048f..2403afcd2fb9 100644 --- a/arch/mips/alchemy/common/irq.c +++ b/arch/mips/alchemy/common/irq.c @@ -758,7 +758,7 @@ static inline void alchemy_ic_resume_one(void __iomem *base, unsigned long *d) wmb(); } -static int alchemy_ic_suspend(void) +static int alchemy_ic_suspend(void *data) { alchemy_ic_suspend_one((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR), alchemy_gpic_pmdata); @@ -767,7 +767,7 @@ static int alchemy_ic_suspend(void) return 0; } -static void alchemy_ic_resume(void) +static void alchemy_ic_resume(void *data) { alchemy_ic_resume_one((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR), &alchemy_gpic_pmdata[7]); @@ -775,7 +775,7 @@ static void alchemy_ic_resume(void) alchemy_gpic_pmdata); } -static int alchemy_gpic_suspend(void) +static int alchemy_gpic_suspend(void *data) { void __iomem *base = (void __iomem *)KSEG1ADDR(AU1300_GPIC_PHYS_ADDR); int i; @@ -806,7 +806,7 @@ static int alchemy_gpic_suspend(void) return 0; } -static void alchemy_gpic_resume(void) +static void alchemy_gpic_resume(void *data) { void __iomem *base = (void __iomem *)KSEG1ADDR(AU1300_GPIC_PHYS_ADDR); int i; @@ -837,16 +837,24 @@ static void alchemy_gpic_resume(void) wmb(); } -static struct syscore_ops alchemy_ic_pmops = { +static const struct syscore_ops alchemy_ic_pmops = { .suspend = alchemy_ic_suspend, .resume = alchemy_ic_resume, }; -static struct syscore_ops alchemy_gpic_pmops = { +static struct syscore alchemy_ic_pm = { + .ops = &alchemy_ic_pmops, +}; + +static const struct syscore_ops alchemy_gpic_pmops = { .suspend = alchemy_gpic_suspend, .resume = alchemy_gpic_resume, }; +static struct syscore alchemy_gpic_pm = { + .ops = &alchemy_gpic_pmops, +}; + /******************************************************************************/ /* create chained handlers for the 4 IC requests to the MIPS IRQ ctrl */ @@ -880,7 +888,7 @@ static void __init au1000_init_irq(struct alchemy_irqmap *map) ic_init((void __iomem *)KSEG1ADDR(AU1000_IC0_PHYS_ADDR)); ic_init((void __iomem *)KSEG1ADDR(AU1000_IC1_PHYS_ADDR)); - register_syscore_ops(&alchemy_ic_pmops); + register_syscore(&alchemy_ic_pm); mips_cpu_irq_init(); /* register all 64 possible IC0+IC1 irq sources as type "none". @@ -925,7 +933,7 @@ static void __init alchemy_gpic_init_irq(const struct alchemy_irqmap *dints) int i; void __iomem *bank_base; - register_syscore_ops(&alchemy_gpic_pmops); + register_syscore(&alchemy_gpic_pm); mips_cpu_irq_init(); /* disable & ack all possible interrupt sources */ diff --git a/arch/mips/alchemy/common/usb.c b/arch/mips/alchemy/common/usb.c index 5d618547ebf0..a55f32bf517c 100644 --- a/arch/mips/alchemy/common/usb.c +++ b/arch/mips/alchemy/common/usb.c @@ -580,22 +580,26 @@ static void alchemy_usb_pm(int susp) } } -static int alchemy_usb_suspend(void) +static int alchemy_usb_suspend(void *data) { alchemy_usb_pm(1); return 0; } -static void alchemy_usb_resume(void) +static void alchemy_usb_resume(void *data) { alchemy_usb_pm(0); } -static struct syscore_ops alchemy_usb_pm_ops = { +static const struct syscore_ops alchemy_usb_pm_syscore_ops = { .suspend = alchemy_usb_suspend, .resume = alchemy_usb_resume, }; +static struct syscore alchemy_usb_pm_syscore = { + .ops = &alchemy_usb_pm_syscore_ops, +}; + static int __init alchemy_usb_init(void) { int ret = 0; @@ -620,7 +624,7 @@ static int __init alchemy_usb_init(void) } if (!ret) - register_syscore_ops(&alchemy_usb_pm_ops); + register_syscore(&alchemy_usb_pm_syscore); return ret; } diff --git a/arch/mips/pci/pci-alchemy.c b/arch/mips/pci/pci-alchemy.c index 58625d1b6465..6bfee0f71803 100644 --- a/arch/mips/pci/pci-alchemy.c +++ b/arch/mips/pci/pci-alchemy.c @@ -304,7 +304,7 @@ static int alchemy_pci_def_idsel(unsigned int devsel, int assert) } /* save PCI controller register contents. */ -static int alchemy_pci_suspend(void) +static int alchemy_pci_suspend(void *data) { struct alchemy_pci_context *ctx = __alchemy_pci_ctx; if (!ctx) @@ -326,7 +326,7 @@ static int alchemy_pci_suspend(void) return 0; } -static void alchemy_pci_resume(void) +static void alchemy_pci_resume(void *data) { struct alchemy_pci_context *ctx = __alchemy_pci_ctx; if (!ctx) @@ -354,9 +354,13 @@ static void alchemy_pci_resume(void) alchemy_pci_wired_entry(ctx); /* install it */ } -static struct syscore_ops alchemy_pci_pmops = { - .suspend = alchemy_pci_suspend, - .resume = alchemy_pci_resume, +static const struct syscore_ops alchemy_pci_syscore_ops = { + .suspend = alchemy_pci_suspend, + .resume = alchemy_pci_resume, +}; + +static struct syscore alchemy_pci_syscore = { + .ops = &alchemy_pci_syscore_ops, }; static int alchemy_pci_probe(struct platform_device *pdev) @@ -478,7 +482,7 @@ static int alchemy_pci_probe(struct platform_device *pdev) __alchemy_pci_ctx = ctx; platform_set_drvdata(pdev, ctx); - register_syscore_ops(&alchemy_pci_pmops); + register_syscore(&alchemy_pci_syscore); register_pci_controller(&ctx->alchemy_pci_ctrl); dev_info(&pdev->dev, "PCI controller at %ld MHz\n", diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 2c07387201d0..2ddb93df4817 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -726,7 +726,7 @@ static inline void crash_register_spus(struct list_head *list) } #endif -static void spu_shutdown(void) +static void spu_shutdown(void *data) { struct spu *spu; @@ -738,10 +738,14 @@ static void spu_shutdown(void) mutex_unlock(&spu_full_list_mutex); } -static struct syscore_ops spu_syscore_ops = { +static const struct syscore_ops spu_syscore_ops = { .shutdown = spu_shutdown, }; +static struct syscore spu_syscore = { + .ops = &spu_syscore_ops, +}; + static int __init init_spu_base(void) { int i, ret = 0; @@ -774,7 +778,7 @@ static int __init init_spu_base(void) crash_register_spus(&spu_full_list); mutex_unlock(&spu_full_list_mutex); spu_add_dev_attr(&dev_attr_stat); - register_syscore_ops(&spu_syscore_ops); + register_syscore(&spu_syscore); spu_init_affinity(); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index c37783a03d25..1959cc13438f 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -600,7 +600,7 @@ not_found: return viaint; } -static int pmacpic_suspend(void) +static int pmacpic_suspend(void *data) { int viaint = pmacpic_find_viaint(); @@ -621,7 +621,7 @@ static int pmacpic_suspend(void) return 0; } -static void pmacpic_resume(void) +static void pmacpic_resume(void *data) { int i; @@ -634,15 +634,19 @@ static void pmacpic_resume(void) pmac_unmask_irq(irq_get_irq_data(i)); } -static struct syscore_ops pmacpic_syscore_ops = { +static const struct syscore_ops pmacpic_syscore_ops = { .suspend = pmacpic_suspend, .resume = pmacpic_resume, }; +static struct syscore pmacpic_syscore = { + .ops = &pmacpic_syscore_ops, +}; + static int __init init_pmacpic_syscore(void) { if (pmac_irq_hw[0]) - register_syscore_ops(&pmacpic_syscore_ops); + register_syscore(&pmacpic_syscore); return 0; } diff --git a/arch/powerpc/sysdev/fsl_lbc.c b/arch/powerpc/sysdev/fsl_lbc.c index 217cea150987..7ed07232a69a 100644 --- a/arch/powerpc/sysdev/fsl_lbc.c +++ b/arch/powerpc/sysdev/fsl_lbc.c @@ -350,7 +350,7 @@ err: #ifdef CONFIG_SUSPEND /* save lbc registers */ -static int fsl_lbc_syscore_suspend(void) +static int fsl_lbc_syscore_suspend(void *data) { struct fsl_lbc_ctrl *ctrl; struct fsl_lbc_regs __iomem *lbc; @@ -374,7 +374,7 @@ out: } /* restore lbc registers */ -static void fsl_lbc_syscore_resume(void) +static void fsl_lbc_syscore_resume(void *data) { struct fsl_lbc_ctrl *ctrl; struct fsl_lbc_regs __iomem *lbc; @@ -408,10 +408,14 @@ static const struct of_device_id fsl_lbc_match[] = { }; #ifdef CONFIG_SUSPEND -static struct syscore_ops lbc_syscore_pm_ops = { +static const struct syscore_ops lbc_syscore_pm_ops = { .suspend = fsl_lbc_syscore_suspend, .resume = fsl_lbc_syscore_resume, }; + +static struct syscore lbc_syscore_pm = { + .ops = &lbc_syscore_pm_ops, +}; #endif static struct platform_driver fsl_lbc_ctrl_driver = { @@ -425,7 +429,7 @@ static struct platform_driver fsl_lbc_ctrl_driver = { static int __init fsl_lbc_init(void) { #ifdef CONFIG_SUSPEND - register_syscore_ops(&lbc_syscore_pm_ops); + register_syscore(&lbc_syscore_pm); #endif return platform_driver_register(&fsl_lbc_ctrl_driver); } diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index ef7707ea0db7..4e501654cb41 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1258,7 +1258,7 @@ static void fsl_pci_syscore_do_suspend(struct pci_controller *hose) send_pme_turnoff_message(hose); } -static int fsl_pci_syscore_suspend(void) +static int fsl_pci_syscore_suspend(void *data) { struct pci_controller *hose, *tmp; @@ -1291,7 +1291,7 @@ static void fsl_pci_syscore_do_resume(struct pci_controller *hose) setup_pci_atmu(hose); } -static void fsl_pci_syscore_resume(void) +static void fsl_pci_syscore_resume(void *data) { struct pci_controller *hose, *tmp; @@ -1299,10 +1299,14 @@ static void fsl_pci_syscore_resume(void) fsl_pci_syscore_do_resume(hose); } -static struct syscore_ops pci_syscore_pm_ops = { +static const struct syscore_ops pci_syscore_pm_ops = { .suspend = fsl_pci_syscore_suspend, .resume = fsl_pci_syscore_resume, }; + +static struct syscore pci_syscore_pm = { + .ops = &pci_syscore_pm_ops, +}; #endif void fsl_pcibios_fixup_phb(struct pci_controller *phb) @@ -1359,7 +1363,7 @@ static struct platform_driver fsl_pci_driver = { static int __init fsl_pci_init(void) { #ifdef CONFIG_PM_SLEEP - register_syscore_ops(&pci_syscore_pm_ops); + register_syscore(&pci_syscore_pm); #endif return platform_driver_register(&fsl_pci_driver); } diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 70be2105865d..290ba8427239 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -817,7 +817,7 @@ static struct { u32 sercr; } ipic_saved_state; -static int ipic_suspend(void) +static int ipic_suspend(void *data) { struct ipic *ipic = primary_ipic; @@ -848,7 +848,7 @@ static int ipic_suspend(void) return 0; } -static void ipic_resume(void) +static void ipic_resume(void *data) { struct ipic *ipic = primary_ipic; @@ -870,18 +870,22 @@ static void ipic_resume(void) #define ipic_resume NULL #endif -static struct syscore_ops ipic_syscore_ops = { +static const struct syscore_ops ipic_syscore_ops = { .suspend = ipic_suspend, .resume = ipic_resume, }; +static struct syscore ipic_syscore = { + .ops = &ipic_syscore_ops, +}; + static int __init init_ipic_syscore(void) { if (!primary_ipic || !primary_ipic->regs) return -ENODEV; printk(KERN_DEBUG "Registering ipic system core operations\n"); - register_syscore_ops(&ipic_syscore_ops); + register_syscore(&ipic_syscore); return 0; } diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index ad7310bba00b..67e51998d1ae 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1944,7 +1944,7 @@ static void mpic_suspend_one(struct mpic *mpic) } } -static int mpic_suspend(void) +static int mpic_suspend(void *data) { struct mpic *mpic = mpics; @@ -1986,7 +1986,7 @@ static void mpic_resume_one(struct mpic *mpic) } /* end for loop */ } -static void mpic_resume(void) +static void mpic_resume(void *data) { struct mpic *mpic = mpics; @@ -1996,19 +1996,23 @@ static void mpic_resume(void) } } -static struct syscore_ops mpic_syscore_ops = { +static const struct syscore_ops mpic_syscore_ops = { .resume = mpic_resume, .suspend = mpic_suspend, }; +static struct syscore mpic_syscore = { + .ops = &mpic_syscore_ops, +}; + static int mpic_init_sys(void) { int rc; - register_syscore_ops(&mpic_syscore_ops); + register_syscore(&mpic_syscore); rc = subsys_system_register(&mpic_subsys, NULL); if (rc) { - unregister_syscore_ops(&mpic_syscore_ops); + unregister_syscore(&mpic_syscore); pr_err("mpic: Failed to register subsystem!\n"); return rc; } diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c index 7166e2e0baaf..60f5b3934b51 100644 --- a/arch/powerpc/sysdev/mpic_timer.c +++ b/arch/powerpc/sysdev/mpic_timer.c @@ -519,7 +519,7 @@ out: kfree(priv); } -static void mpic_timer_resume(void) +static void mpic_timer_resume(void *data) { struct timer_group_priv *priv; @@ -535,10 +535,14 @@ static const struct of_device_id mpic_timer_ids[] = { {}, }; -static struct syscore_ops mpic_timer_syscore_ops = { +static const struct syscore_ops mpic_timer_syscore_ops = { .resume = mpic_timer_resume, }; +static struct syscore mpic_timer_syscore = { + .ops = &mpic_timer_syscore_ops, +}; + static int __init mpic_timer_init(void) { struct device_node *np = NULL; @@ -546,7 +550,7 @@ static int __init mpic_timer_init(void) for_each_matching_node(np, mpic_timer_ids) timer_group_init(np); - register_syscore_ops(&mpic_timer_syscore_ops); + register_syscore(&mpic_timer_syscore); if (list_empty(&timer_group_list)) return -ENODEV; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 68eb7cc6e564..482eec50f404 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -857,7 +857,7 @@ static int __init pmb_debugfs_init(void) subsys_initcall(pmb_debugfs_init); #ifdef CONFIG_PM -static void pmb_syscore_resume(void) +static void pmb_syscore_resume(void *data) { struct pmb_entry *pmbe; int i; @@ -874,13 +874,17 @@ static void pmb_syscore_resume(void) read_unlock(&pmb_rwlock); } -static struct syscore_ops pmb_syscore_ops = { +static const struct syscore_ops pmb_syscore_ops = { .resume = pmb_syscore_resume, }; +static struct syscore pmb_syscore = { + .ops = &pmb_syscore_ops, +}; + static int __init pmb_sysdev_init(void) { - register_syscore_ops(&pmb_syscore_ops); + register_syscore(&pmb_syscore); return 0; } subsys_initcall(pmb_sysdev_init); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 112f43b23ebf..aca89f23d2e0 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1718,26 +1718,30 @@ static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu) #ifdef CONFIG_PM -static int perf_ibs_suspend(void) +static int perf_ibs_suspend(void *data) { clear_APIC_ibs(); return 0; } -static void perf_ibs_resume(void) +static void perf_ibs_resume(void *data) { ibs_eilvt_setup(); setup_APIC_ibs(); } -static struct syscore_ops perf_ibs_syscore_ops = { +static const struct syscore_ops perf_ibs_syscore_ops = { .resume = perf_ibs_resume, .suspend = perf_ibs_suspend, }; +static struct syscore perf_ibs_syscore = { + .ops = &perf_ibs_syscore_ops, +}; + static void perf_ibs_pm_init(void) { - register_syscore_ops(&perf_ibs_syscore_ops); + register_syscore(&perf_ibs_syscore); } #else diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e890fd37e9c2..085ef4f2e73a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -351,7 +351,7 @@ static int __init hv_pci_init(void) return 1; } -static int hv_suspend(void) +static int hv_suspend(void *data) { union hv_x64_msr_hypercall_contents hypercall_msr; int ret; @@ -378,7 +378,7 @@ static int hv_suspend(void) return ret; } -static void hv_resume(void) +static void hv_resume(void *data) { union hv_x64_msr_hypercall_contents hypercall_msr; int ret; @@ -405,11 +405,15 @@ static void hv_resume(void) } /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ -static struct syscore_ops hv_syscore_ops = { +static const struct syscore_ops hv_syscore_ops = { .suspend = hv_suspend, .resume = hv_resume, }; +static struct syscore hv_syscore = { + .ops = &hv_syscore_ops, +}; + static void (* __initdata old_setup_percpu_clockev)(void); static void __init hv_stimer_setup_percpu_clockev(void) @@ -569,7 +573,7 @@ skip_hypercall_pg_init: x86_init.pci.arch_init = hv_pci_init; - register_syscore_ops(&hv_syscore_ops); + register_syscore(&hv_syscore); if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) hv_get_partition_id(); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 3485d419c2f5..e6e68a31634c 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -591,7 +591,7 @@ static void gart_fixup_northbridges(void) } } -static void gart_resume(void) +static void gart_resume(void *data) { pr_info("PCI-DMA: Resuming GART IOMMU\n"); @@ -600,11 +600,15 @@ static void gart_resume(void) enable_gart_translations(); } -static struct syscore_ops gart_syscore_ops = { +static const struct syscore_ops gart_syscore_ops = { .resume = gart_resume, }; +static struct syscore gart_syscore = { + .ops = &gart_syscore_ops, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -650,7 +654,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - register_syscore_ops(&gart_syscore_ops); + register_syscore(&gart_syscore); flush_gart(); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 680d305589a3..fd87b1562c7e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2381,7 +2381,7 @@ static struct { unsigned int apic_cmci; } apic_pm_state; -static int lapic_suspend(void) +static int lapic_suspend(void *data) { unsigned long flags; int maxlvt; @@ -2429,7 +2429,7 @@ static int lapic_suspend(void) return 0; } -static void lapic_resume(void) +static void lapic_resume(void *data) { unsigned int l, h; unsigned long flags; @@ -2504,11 +2504,15 @@ static void lapic_resume(void) * are needed on every CPU up until machine_halt/restart/poweroff. */ -static struct syscore_ops lapic_syscore_ops = { +static const struct syscore_ops lapic_syscore_ops = { .resume = lapic_resume, .suspend = lapic_suspend, }; +static struct syscore lapic_syscore = { + .ops = &lapic_syscore_ops, +}; + static void apic_pm_activate(void) { apic_pm_state.active = 1; @@ -2518,7 +2522,7 @@ static int __init init_lapic_sysfs(void) { /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ if (boot_cpu_has(X86_FEATURE_APIC)) - register_syscore_ops(&lapic_syscore_ops); + register_syscore(&lapic_syscore); return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5ba2feb2c04c..84e200662ce6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2308,7 +2308,12 @@ static void resume_ioapic_id(int ioapic_idx) } } -static void ioapic_resume(void) +static int ioapic_suspend(void *data) +{ + return save_ioapic_entries(); +} + +static void ioapic_resume(void *data) { int ioapic_idx; @@ -2318,14 +2323,18 @@ static void ioapic_resume(void) restore_ioapic_entries(); } -static struct syscore_ops ioapic_syscore_ops = { - .suspend = save_ioapic_entries, +static const struct syscore_ops ioapic_syscore_ops = { + .suspend = ioapic_suspend, .resume = ioapic_resume, }; +static struct syscore ioapic_syscore = { + .ops = &ioapic_syscore_ops, +}; + static int __init ioapic_init_ops(void) { - register_syscore_ops(&ioapic_syscore_ops); + register_syscore(&ioapic_syscore); return 0; } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index a315b0627dfb..7ffc78d5ebf2 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -37,7 +37,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; -static void init_counter_refs(void) +static void init_counter_refs(void *data) { u64 aperf, mperf; @@ -289,16 +289,20 @@ out: } #ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { +static const struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; -static void register_freq_invariance_syscore_ops(void) +static struct syscore freq_invariance_syscore = { + .ops = &freq_invariance_syscore_ops, +}; + +static void register_freq_invariance_syscore(void) { - register_syscore_ops(&freq_invariance_syscore_ops); + register_syscore(&freq_invariance_syscore); } #else -static inline void register_freq_invariance_syscore_ops(void) {} +static inline void register_freq_invariance_syscore(void) {} #endif static void freq_invariance_enable(void) @@ -308,7 +312,7 @@ static void freq_invariance_enable(void) return; } static_branch_enable_cpuslocked(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); + register_freq_invariance_syscore(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -535,7 +539,7 @@ static int __init bp_init_aperfmperf(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; - init_counter_refs(); + init_counter_refs(NULL); bp_init_freq_invariance(); return 0; } @@ -544,5 +548,5 @@ early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - init_counter_refs(); + init_counter_refs(NULL); } diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index bc7671f920a7..2c56f8730f59 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -75,7 +75,7 @@ static u8 energ_perf_values[] = { [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, }; -static int intel_epb_save(void) +static int intel_epb_save(void *data) { u64 epb; @@ -89,7 +89,7 @@ static int intel_epb_save(void) return 0; } -static void intel_epb_restore(void) +static void intel_epb_restore(void *data) { u64 val = this_cpu_read(saved_epb); u64 epb; @@ -114,11 +114,15 @@ static void intel_epb_restore(void) wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } -static struct syscore_ops intel_epb_syscore_ops = { +static const struct syscore_ops intel_epb_syscore_ops = { .suspend = intel_epb_save, .resume = intel_epb_restore, }; +static struct syscore intel_epb_syscore = { + .ops = &intel_epb_syscore_ops, +}; + static const char * const energy_perf_strings[] = { [EPB_INDEX_PERFORMANCE] = "performance", [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", @@ -185,7 +189,7 @@ static int intel_epb_online(unsigned int cpu) { struct device *cpu_dev = get_cpu_device(cpu); - intel_epb_restore(); + intel_epb_restore(NULL); if (!cpuhp_tasks_frozen) sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); @@ -199,7 +203,7 @@ static int intel_epb_offline(unsigned int cpu) if (!cpuhp_tasks_frozen) sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); - intel_epb_save(); + intel_epb_save(NULL); return 0; } @@ -230,7 +234,7 @@ static __init int intel_epb_init(void) if (ret < 0) goto err_out_online; - register_syscore_ops(&intel_epb_syscore_ops); + register_syscore(&intel_epb_syscore); return 0; err_out_online: diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 460e90a1a0b1..23bfbc7dfb8e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2410,13 +2410,13 @@ static void vendor_disable_error_reporting(void) mce_disable_error_reporting(); } -static int mce_syscore_suspend(void) +static int mce_syscore_suspend(void *data) { vendor_disable_error_reporting(); return 0; } -static void mce_syscore_shutdown(void) +static void mce_syscore_shutdown(void *data) { vendor_disable_error_reporting(); } @@ -2426,7 +2426,7 @@ static void mce_syscore_shutdown(void) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static void mce_syscore_resume(void) +static void mce_syscore_resume(void *data) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); @@ -2434,12 +2434,16 @@ static void mce_syscore_resume(void) cr4_set_bits(X86_CR4_MCE); } -static struct syscore_ops mce_syscore_ops = { +static const struct syscore_ops mce_syscore_ops = { .suspend = mce_syscore_suspend, .shutdown = mce_syscore_shutdown, .resume = mce_syscore_resume, }; +static struct syscore mce_syscore = { + .ops = &mce_syscore_ops, +}; + /* * mce_device: Sysfs support */ @@ -2840,7 +2844,7 @@ static __init int mcheck_init_device(void) if (err < 0) goto err_out_online; - register_syscore_ops(&mce_syscore_ops); + register_syscore(&mce_syscore); return 0; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index f75c140906d0..3262756f8c32 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -812,8 +812,17 @@ void microcode_bsp_resume(void) reload_early_microcode(cpu); } -static struct syscore_ops mc_syscore_ops = { - .resume = microcode_bsp_resume, +static void microcode_bsp_syscore_resume(void *data) +{ + microcode_bsp_resume(); +} + +static const struct syscore_ops mc_syscore_ops = { + .resume = microcode_bsp_syscore_resume, +}; + +static struct syscore mc_syscore = { + .ops = &mc_syscore_ops, }; static int mc_cpu_online(unsigned int cpu) @@ -892,7 +901,7 @@ static int __init microcode_init(void) } } - register_syscore_ops(&mc_syscore_ops); + register_syscore(&mc_syscore); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c index d25882fcf181..2415ffaaf02c 100644 --- a/arch/x86/kernel/cpu/mtrr/legacy.c +++ b/arch/x86/kernel/cpu/mtrr/legacy.c @@ -41,7 +41,7 @@ struct mtrr_value { static struct mtrr_value *mtrr_value; -static int mtrr_save(void) +static int mtrr_save(void *data) { int i; @@ -56,7 +56,7 @@ static int mtrr_save(void) return 0; } -static void mtrr_restore(void) +static void mtrr_restore(void *data) { int i; @@ -69,11 +69,15 @@ static void mtrr_restore(void) } } -static struct syscore_ops mtrr_syscore_ops = { +static const struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; +static struct syscore mtrr_syscore = { + .ops = &mtrr_syscore_ops, +}; + void mtrr_register_syscore(void) { mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL); @@ -86,5 +90,5 @@ void mtrr_register_syscore(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - register_syscore_ops(&mtrr_syscore_ops); + register_syscore(&mtrr_syscore); } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 933fcd7ff250..e4a31c536642 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -86,15 +86,19 @@ static int umwait_cpu_offline(unsigned int cpu) * trust the firmware nor does it matter if the same value is written * again. */ -static void umwait_syscore_resume(void) +static void umwait_syscore_resume(void *data) { umwait_update_control_msr(NULL); } -static struct syscore_ops umwait_syscore_ops = { +static const struct syscore_ops umwait_syscore_ops = { .resume = umwait_syscore_resume, }; +static struct syscore umwait_syscore = { + .ops = &umwait_syscore_ops, +}; + /* sysfs interface */ /* @@ -226,7 +230,7 @@ static int __init umwait_init(void) return ret; } - register_syscore_ops(&umwait_syscore_ops); + register_syscore(&umwait_syscore); /* * Add umwait control interface. Ignore failure, so at least the diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index 2cd124ad9380..896d46b44284 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -19,7 +19,7 @@ * in asm/dma.h. */ -static void i8237A_resume(void) +static void i8237A_resume(void *data) { unsigned long flags; int i; @@ -41,10 +41,14 @@ static void i8237A_resume(void) release_dma_lock(flags); } -static struct syscore_ops i8237_syscore_ops = { +static const struct syscore_ops i8237_syscore_ops = { .resume = i8237A_resume, }; +static struct syscore i8237_syscore = { + .ops = &i8237_syscore_ops, +}; + static int __init i8237A_init_ops(void) { /* @@ -70,7 +74,7 @@ static int __init i8237A_init_ops(void) if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017) return -ENODEV; - register_syscore_ops(&i8237_syscore_ops); + register_syscore(&i8237_syscore); return 0; } device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 2bade73f49e3..f67063df6723 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -247,19 +247,19 @@ static void save_ELCR(char *trigger) trigger[1] = inb(PIC_ELCR2) & 0xDE; } -static void i8259A_resume(void) +static void i8259A_resume(void *data) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); } -static int i8259A_suspend(void) +static int i8259A_suspend(void *data) { save_ELCR(irq_trigger); return 0; } -static void i8259A_shutdown(void) +static void i8259A_shutdown(void *data) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -269,12 +269,16 @@ static void i8259A_shutdown(void) outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ } -static struct syscore_ops i8259_syscore_ops = { +static const struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; +static struct syscore i8259_syscore = { + .ops = &i8259_syscore_ops, +}; + static void mask_8259A(void) { unsigned long flags; @@ -444,7 +448,7 @@ EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { if (legacy_pic == &default_legacy_pic) - register_syscore_ops(&i8259_syscore_ops); + register_syscore(&i8259_syscore); return 0; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b67d7c59dca0..1500852ba03c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -720,7 +720,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) #endif -static int kvm_suspend(void) +static int kvm_suspend(void *data) { u64 val = 0; @@ -734,7 +734,7 @@ static int kvm_suspend(void) return 0; } -static void kvm_resume(void) +static void kvm_resume(void *data) { kvm_cpu_online(raw_smp_processor_id()); @@ -744,11 +744,15 @@ static void kvm_resume(void) #endif } -static struct syscore_ops kvm_syscore_ops = { +static const struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; +static struct syscore kvm_syscore = { + .ops = &kvm_syscore_ops, +}; + static void kvm_pv_guest_cpu_reboot(void *unused) { kvm_guest_cpu_offline(true); @@ -858,7 +862,7 @@ static void __init kvm_guest_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif - register_syscore_ops(&kvm_syscore_ops); + register_syscore(&kvm_syscore); /* * Hard lockup detection is enabled by default. Disable it, as guests diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index e4560b33b8ad..bed7dc85612e 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -761,7 +761,7 @@ static int acpi_pci_link_resume(struct acpi_pci_link *link) return 0; } -static void irqrouter_resume(void) +static void irqrouter_resume(void *data) { struct acpi_pci_link *link; @@ -888,10 +888,14 @@ static int __init acpi_irq_balance_set(char *str) __setup("acpi_irq_balance", acpi_irq_balance_set); -static struct syscore_ops irqrouter_syscore_ops = { +static const struct syscore_ops irqrouter_syscore_ops = { .resume = irqrouter_resume, }; +static struct syscore irqrouter_syscore = { + .ops = &irqrouter_syscore_ops, +}; + void __init acpi_pci_link_init(void) { if (acpi_noirq) @@ -904,6 +908,6 @@ void __init acpi_pci_link_init(void) else acpi_irq_balance = 0; } - register_syscore_ops(&irqrouter_syscore_ops); + register_syscore(&irqrouter_syscore); acpi_scan_add_handler(&pci_link_handler); } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c8ee8e42b0f6..aaf57d0aaa19 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -884,13 +884,13 @@ bool acpi_s2idle_wakeup(void) #ifdef CONFIG_PM_SLEEP static u32 saved_bm_rld; -static int acpi_save_bm_rld(void) +static int acpi_save_bm_rld(void *data) { acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_RLD, &saved_bm_rld); return 0; } -static void acpi_restore_bm_rld(void) +static void acpi_restore_bm_rld(void *data) { u32 resumed_bm_rld = 0; @@ -901,14 +901,18 @@ static void acpi_restore_bm_rld(void) acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, saved_bm_rld); } -static struct syscore_ops acpi_sleep_syscore_ops = { +static const struct syscore_ops acpi_sleep_syscore_ops = { .suspend = acpi_save_bm_rld, .resume = acpi_restore_bm_rld, }; +static struct syscore acpi_sleep_syscore = { + .ops = &acpi_sleep_syscore_ops, +}; + static void acpi_sleep_syscore_init(void) { - register_syscore_ops(&acpi_sleep_syscore_ops); + register_syscore(&acpi_sleep_syscore); } #else static inline void acpi_sleep_syscore_init(void) {} diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 6942c62fa59d..8191dbab92c4 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -1585,16 +1585,20 @@ static int fw_pm_notify(struct notifier_block *notify_block, } /* stop caching firmware once syscore_suspend is reached */ -static int fw_suspend(void) +static int fw_suspend(void *data) { fw_cache.state = FW_LOADER_NO_CACHE; return 0; } -static struct syscore_ops fw_syscore_ops = { +static const struct syscore_ops fw_syscore_ops = { .suspend = fw_suspend, }; +static struct syscore fw_syscore = { + .ops = &fw_syscore_ops, +}; + static int __init register_fw_pm_ops(void) { int ret; @@ -1610,14 +1614,14 @@ static int __init register_fw_pm_ops(void) if (ret) return ret; - register_syscore_ops(&fw_syscore_ops); + register_syscore(&fw_syscore); return ret; } static inline void unregister_fw_pm_ops(void) { - unregister_syscore_ops(&fw_syscore_ops); + unregister_syscore(&fw_syscore); unregister_pm_notifier(&fw_cache.pm_notify); } #else diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index 13db1f78d2ce..483adb796654 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -11,32 +11,32 @@ #include #include -static LIST_HEAD(syscore_ops_list); -static DEFINE_MUTEX(syscore_ops_lock); +static LIST_HEAD(syscore_list); +static DEFINE_MUTEX(syscore_lock); /** - * register_syscore_ops - Register a set of system core operations. - * @ops: System core operations to register. + * register_syscore - Register a set of system core operations. + * @syscore: System core operations to register. */ -void register_syscore_ops(struct syscore_ops *ops) +void register_syscore(struct syscore *syscore) { - mutex_lock(&syscore_ops_lock); - list_add_tail(&ops->node, &syscore_ops_list); - mutex_unlock(&syscore_ops_lock); + mutex_lock(&syscore_lock); + list_add_tail(&syscore->node, &syscore_list); + mutex_unlock(&syscore_lock); } -EXPORT_SYMBOL_GPL(register_syscore_ops); +EXPORT_SYMBOL_GPL(register_syscore); /** - * unregister_syscore_ops - Unregister a set of system core operations. - * @ops: System core operations to unregister. + * unregister_syscore - Unregister a set of system core operations. + * @syscore: System core operations to unregister. */ -void unregister_syscore_ops(struct syscore_ops *ops) +void unregister_syscore(struct syscore *syscore) { - mutex_lock(&syscore_ops_lock); - list_del(&ops->node); - mutex_unlock(&syscore_ops_lock); + mutex_lock(&syscore_lock); + list_del(&syscore->node); + mutex_unlock(&syscore_lock); } -EXPORT_SYMBOL_GPL(unregister_syscore_ops); +EXPORT_SYMBOL_GPL(unregister_syscore); #ifdef CONFIG_PM_SLEEP /** @@ -46,7 +46,7 @@ EXPORT_SYMBOL_GPL(unregister_syscore_ops); */ int syscore_suspend(void) { - struct syscore_ops *ops; + struct syscore *syscore; int ret = 0; trace_suspend_resume(TPS("syscore_suspend"), 0, true); @@ -59,25 +59,27 @@ int syscore_suspend(void) WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core suspend.\n"); - list_for_each_entry_reverse(ops, &syscore_ops_list, node) - if (ops->suspend) { - pm_pr_dbg("Calling %pS\n", ops->suspend); - ret = ops->suspend(); + list_for_each_entry_reverse(syscore, &syscore_list, node) + if (syscore->ops->suspend) { + pm_pr_dbg("Calling %pS\n", syscore->ops->suspend); + ret = syscore->ops->suspend(syscore->data); if (ret) goto err_out; WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pS\n", ops->suspend); + "Interrupts enabled after %pS\n", + syscore->ops->suspend); } trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: - pr_err("PM: System core suspend callback %pS failed.\n", ops->suspend); + pr_err("PM: System core suspend callback %pS failed.\n", + syscore->ops->suspend); - list_for_each_entry_continue(ops, &syscore_ops_list, node) - if (ops->resume) - ops->resume(); + list_for_each_entry_continue(syscore, &syscore_list, node) + if (syscore->ops->resume) + syscore->ops->resume(syscore->data); return ret; } @@ -90,18 +92,19 @@ EXPORT_SYMBOL_GPL(syscore_suspend); */ void syscore_resume(void) { - struct syscore_ops *ops; + struct syscore *syscore; trace_suspend_resume(TPS("syscore_resume"), 0, true); WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core resume.\n"); - list_for_each_entry(ops, &syscore_ops_list, node) - if (ops->resume) { - pm_pr_dbg("Calling %pS\n", ops->resume); - ops->resume(); + list_for_each_entry(syscore, &syscore_list, node) + if (syscore->ops->resume) { + pm_pr_dbg("Calling %pS\n", syscore->ops->resume); + syscore->ops->resume(syscore->data); WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pS\n", ops->resume); + "Interrupts enabled after %pS\n", + syscore->ops->resume); } trace_suspend_resume(TPS("syscore_resume"), 0, false); } @@ -113,16 +116,17 @@ EXPORT_SYMBOL_GPL(syscore_resume); */ void syscore_shutdown(void) { - struct syscore_ops *ops; + struct syscore *syscore; - mutex_lock(&syscore_ops_lock); + mutex_lock(&syscore_lock); - list_for_each_entry_reverse(ops, &syscore_ops_list, node) - if (ops->shutdown) { + list_for_each_entry_reverse(syscore, &syscore_list, node) + if (syscore->ops->shutdown) { if (initcall_debug) - pr_info("PM: Calling %pS\n", ops->shutdown); - ops->shutdown(); + pr_info("PM: Calling %pS\n", + syscore->ops->shutdown); + syscore->ops->shutdown(syscore->data); } - mutex_unlock(&syscore_ops_lock); + mutex_unlock(&syscore_lock); } diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c index 00cb792bda18..dd94145c9b22 100644 --- a/drivers/bus/mvebu-mbus.c +++ b/drivers/bus/mvebu-mbus.c @@ -1006,7 +1006,7 @@ static __init int mvebu_mbus_debugfs_init(void) } fs_initcall(mvebu_mbus_debugfs_init); -static int mvebu_mbus_suspend(void) +static int mvebu_mbus_suspend(void *data) { struct mvebu_mbus_state *s = &mbus_state; int win; @@ -1040,7 +1040,7 @@ static int mvebu_mbus_suspend(void) return 0; } -static void mvebu_mbus_resume(void) +static void mvebu_mbus_resume(void *data) { struct mvebu_mbus_state *s = &mbus_state; int win; @@ -1069,9 +1069,13 @@ static void mvebu_mbus_resume(void) } } -static struct syscore_ops mvebu_mbus_syscore_ops = { - .suspend = mvebu_mbus_suspend, - .resume = mvebu_mbus_resume, +static const struct syscore_ops mvebu_mbus_syscore_ops = { + .suspend = mvebu_mbus_suspend, + .resume = mvebu_mbus_resume, +}; + +static struct syscore mvebu_mbus_syscore = { + .ops = &mvebu_mbus_syscore_ops, }; static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus, @@ -1118,7 +1122,7 @@ static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus, writel(UNIT_SYNC_BARRIER_ALL, mbus->mbuswins_base + UNIT_SYNC_BARRIER_OFF); - register_syscore_ops(&mvebu_mbus_syscore_ops); + register_syscore(&mvebu_mbus_syscore); return 0; } diff --git a/drivers/clk/at91/pmc.c b/drivers/clk/at91/pmc.c index acf780a81589..2310f6f73162 100644 --- a/drivers/clk/at91/pmc.c +++ b/drivers/clk/at91/pmc.c @@ -115,7 +115,7 @@ struct pmc_data *pmc_data_allocate(unsigned int ncore, unsigned int nsystem, /* Address in SECURAM that say if we suspend to backup mode. */ static void __iomem *at91_pmc_backup_suspend; -static int at91_pmc_suspend(void) +static int at91_pmc_suspend(void *data) { unsigned int backup; @@ -129,7 +129,7 @@ static int at91_pmc_suspend(void) return clk_save_context(); } -static void at91_pmc_resume(void) +static void at91_pmc_resume(void *data) { unsigned int backup; @@ -143,11 +143,15 @@ static void at91_pmc_resume(void) clk_restore_context(); } -static struct syscore_ops pmc_syscore_ops = { +static const struct syscore_ops pmc_syscore_ops = { .suspend = at91_pmc_suspend, .resume = at91_pmc_resume, }; +static struct syscore pmc_syscore = { + .ops = &pmc_syscore_ops, +}; + static const struct of_device_id pmc_dt_ids[] = { { .compatible = "atmel,sama5d2-pmc" }, { .compatible = "microchip,sama7g5-pmc", }, @@ -185,7 +189,7 @@ static int __init pmc_register_ops(void) return -ENOMEM; } - register_syscore_ops(&pmc_syscore_ops); + register_syscore(&pmc_syscore); return 0; } diff --git a/drivers/clk/imx/clk-vf610.c b/drivers/clk/imx/clk-vf610.c index 9e11f1c7c397..41eb38552a9c 100644 --- a/drivers/clk/imx/clk-vf610.c +++ b/drivers/clk/imx/clk-vf610.c @@ -139,7 +139,7 @@ static struct clk * __init vf610_get_fixed_clock( return clk; }; -static int vf610_clk_suspend(void) +static int vf610_clk_suspend(void *data) { int i; @@ -156,7 +156,7 @@ static int vf610_clk_suspend(void) return 0; } -static void vf610_clk_resume(void) +static void vf610_clk_resume(void *data) { int i; @@ -171,11 +171,15 @@ static void vf610_clk_resume(void) writel_relaxed(ccgr[i], CCM_CCGRx(i)); } -static struct syscore_ops vf610_clk_syscore_ops = { +static const struct syscore_ops vf610_clk_syscore_ops = { .suspend = vf610_clk_suspend, .resume = vf610_clk_resume, }; +static struct syscore vf610_clk_syscore = { + .ops = &vf610_clk_syscore_ops, +}; + static void __init vf610_clocks_init(struct device_node *ccm_node) { struct device_node *np; @@ -462,7 +466,7 @@ static void __init vf610_clocks_init(struct device_node *ccm_node) for (i = 0; i < ARRAY_SIZE(clks_init_on); i++) clk_prepare_enable(clk[clks_init_on[i]]); - register_syscore_ops(&vf610_clk_syscore_ops); + register_syscore(&vf610_clk_syscore); /* Add the clocks to provider list */ clk_data.clks = clk; diff --git a/drivers/clk/ingenic/jz4725b-cgu.c b/drivers/clk/ingenic/jz4725b-cgu.c index 590e9c85cb25..94cee44c854f 100644 --- a/drivers/clk/ingenic/jz4725b-cgu.c +++ b/drivers/clk/ingenic/jz4725b-cgu.c @@ -268,6 +268,6 @@ static void __init jz4725b_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } CLK_OF_DECLARE_DRIVER(jz4725b_cgu, "ingenic,jz4725b-cgu", jz4725b_cgu_init); diff --git a/drivers/clk/ingenic/jz4740-cgu.c b/drivers/clk/ingenic/jz4740-cgu.c index 3e0a30574ebb..2def3aedc8dd 100644 --- a/drivers/clk/ingenic/jz4740-cgu.c +++ b/drivers/clk/ingenic/jz4740-cgu.c @@ -266,6 +266,6 @@ static void __init jz4740_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } CLK_OF_DECLARE_DRIVER(jz4740_cgu, "ingenic,jz4740-cgu", jz4740_cgu_init); diff --git a/drivers/clk/ingenic/jz4755-cgu.c b/drivers/clk/ingenic/jz4755-cgu.c index f2c2d848dab7..17cf5dcaece9 100644 --- a/drivers/clk/ingenic/jz4755-cgu.c +++ b/drivers/clk/ingenic/jz4755-cgu.c @@ -337,7 +337,7 @@ static void __init jz4755_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* * CGU has some children devices, this is useful for probing children devices diff --git a/drivers/clk/ingenic/jz4760-cgu.c b/drivers/clk/ingenic/jz4760-cgu.c index e407f00bd594..372fe4b07992 100644 --- a/drivers/clk/ingenic/jz4760-cgu.c +++ b/drivers/clk/ingenic/jz4760-cgu.c @@ -436,7 +436,7 @@ static void __init jz4760_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* We only probe via devicetree, no need for a platform driver */ diff --git a/drivers/clk/ingenic/jz4770-cgu.c b/drivers/clk/ingenic/jz4770-cgu.c index 6ae1740367f9..58f1d3bad677 100644 --- a/drivers/clk/ingenic/jz4770-cgu.c +++ b/drivers/clk/ingenic/jz4770-cgu.c @@ -456,7 +456,7 @@ static void __init jz4770_cgu_init(struct device_node *np) if (retval) pr_err("%s: failed to register CGU Clocks\n", __func__); - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* We only probe via devicetree, no need for a platform driver */ diff --git a/drivers/clk/ingenic/jz4780-cgu.c b/drivers/clk/ingenic/jz4780-cgu.c index 07e2f3c5c454..1e88aef7ac0f 100644 --- a/drivers/clk/ingenic/jz4780-cgu.c +++ b/drivers/clk/ingenic/jz4780-cgu.c @@ -803,6 +803,6 @@ static void __init jz4780_cgu_init(struct device_node *np) return; } - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } CLK_OF_DECLARE_DRIVER(jz4780_cgu, "ingenic,jz4780-cgu", jz4780_cgu_init); diff --git a/drivers/clk/ingenic/pm.c b/drivers/clk/ingenic/pm.c index 341752b640d2..206d5cf2872f 100644 --- a/drivers/clk/ingenic/pm.c +++ b/drivers/clk/ingenic/pm.c @@ -15,7 +15,7 @@ static void __iomem * __maybe_unused ingenic_cgu_base; -static int __maybe_unused ingenic_cgu_pm_suspend(void) +static int __maybe_unused ingenic_cgu_pm_suspend(void *data) { u32 val = readl(ingenic_cgu_base + CGU_REG_LCR); @@ -24,22 +24,26 @@ static int __maybe_unused ingenic_cgu_pm_suspend(void) return 0; } -static void __maybe_unused ingenic_cgu_pm_resume(void) +static void __maybe_unused ingenic_cgu_pm_resume(void *data) { u32 val = readl(ingenic_cgu_base + CGU_REG_LCR); writel(val & ~LCR_LOW_POWER_MODE, ingenic_cgu_base + CGU_REG_LCR); } -static struct syscore_ops __maybe_unused ingenic_cgu_pm_ops = { +static const struct syscore_ops __maybe_unused ingenic_cgu_pm_ops = { .suspend = ingenic_cgu_pm_suspend, .resume = ingenic_cgu_pm_resume, }; -void ingenic_cgu_register_syscore_ops(struct ingenic_cgu *cgu) +static struct syscore __maybe_unused ingenic_cgu_pm = { + .ops = &ingenic_cgu_pm_ops, +}; + +void ingenic_cgu_register_syscore(struct ingenic_cgu *cgu) { if (IS_ENABLED(CONFIG_PM_SLEEP)) { ingenic_cgu_base = cgu->base; - register_syscore_ops(&ingenic_cgu_pm_ops); + register_syscore(&ingenic_cgu_pm); } } diff --git a/drivers/clk/ingenic/pm.h b/drivers/clk/ingenic/pm.h index fa7540407b6b..0dcb57dc64cb 100644 --- a/drivers/clk/ingenic/pm.h +++ b/drivers/clk/ingenic/pm.h @@ -7,6 +7,6 @@ struct ingenic_cgu; -void ingenic_cgu_register_syscore_ops(struct ingenic_cgu *cgu); +void ingenic_cgu_register_syscore(struct ingenic_cgu *cgu); #endif /* DRIVERS_CLK_INGENIC_PM_H */ diff --git a/drivers/clk/ingenic/tcu.c b/drivers/clk/ingenic/tcu.c index 7d04ef40b7cf..bc6a51da2072 100644 --- a/drivers/clk/ingenic/tcu.c +++ b/drivers/clk/ingenic/tcu.c @@ -455,7 +455,7 @@ err_free_tcu: return ret; } -static int __maybe_unused tcu_pm_suspend(void) +static int __maybe_unused tcu_pm_suspend(void *data) { struct ingenic_tcu *tcu = ingenic_tcu; @@ -465,7 +465,7 @@ static int __maybe_unused tcu_pm_suspend(void) return 0; } -static void __maybe_unused tcu_pm_resume(void) +static void __maybe_unused tcu_pm_resume(void *data) { struct ingenic_tcu *tcu = ingenic_tcu; @@ -473,11 +473,15 @@ static void __maybe_unused tcu_pm_resume(void) clk_enable(tcu->clk); } -static struct syscore_ops __maybe_unused tcu_pm_ops = { +static const struct syscore_ops __maybe_unused tcu_pm_ops = { .suspend = tcu_pm_suspend, .resume = tcu_pm_resume, }; +static struct syscore __maybe_unused tcu_pm = { + .ops = &tcu_pm_ops, +}; + static void __init ingenic_tcu_init(struct device_node *np) { int ret = ingenic_tcu_probe(np); @@ -486,7 +490,7 @@ static void __init ingenic_tcu_init(struct device_node *np) pr_crit("Failed to initialize TCU clocks: %d\n", ret); if (IS_ENABLED(CONFIG_PM_SLEEP)) - register_syscore_ops(&tcu_pm_ops); + register_syscore(&tcu_pm); } CLK_OF_DECLARE_DRIVER(jz4740_cgu, "ingenic,jz4740-tcu", ingenic_tcu_init); diff --git a/drivers/clk/ingenic/x1000-cgu.c b/drivers/clk/ingenic/x1000-cgu.c index d80886caf393..d89bdfb7c219 100644 --- a/drivers/clk/ingenic/x1000-cgu.c +++ b/drivers/clk/ingenic/x1000-cgu.c @@ -556,7 +556,7 @@ static void __init x1000_cgu_init(struct device_node *np) return; } - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* * CGU has some children devices, this is useful for probing children devices diff --git a/drivers/clk/ingenic/x1830-cgu.c b/drivers/clk/ingenic/x1830-cgu.c index 0fd46e50a513..acf856e5009e 100644 --- a/drivers/clk/ingenic/x1830-cgu.c +++ b/drivers/clk/ingenic/x1830-cgu.c @@ -463,7 +463,7 @@ static void __init x1830_cgu_init(struct device_node *np) return; } - ingenic_cgu_register_syscore_ops(cgu); + ingenic_cgu_register_syscore(cgu); } /* * CGU has some children devices, this is useful for probing children devices diff --git a/drivers/clk/mvebu/common.c b/drivers/clk/mvebu/common.c index 785dbede4835..5adbbd91a6db 100644 --- a/drivers/clk/mvebu/common.c +++ b/drivers/clk/mvebu/common.c @@ -215,22 +215,26 @@ static struct clk *clk_gating_get_src( return ERR_PTR(-ENODEV); } -static int mvebu_clk_gating_suspend(void) +static int mvebu_clk_gating_suspend(void *data) { ctrl->saved_reg = readl(ctrl->base); return 0; } -static void mvebu_clk_gating_resume(void) +static void mvebu_clk_gating_resume(void *data) { writel(ctrl->saved_reg, ctrl->base); } -static struct syscore_ops clk_gate_syscore_ops = { +static const struct syscore_ops clk_gate_syscore_ops = { .suspend = mvebu_clk_gating_suspend, .resume = mvebu_clk_gating_resume, }; +static struct syscore clk_gate_syscore = { + .ops = &clk_gate_syscore_ops, +}; + void __init mvebu_clk_gating_setup(struct device_node *np, const struct clk_gating_soc_desc *desc) { @@ -284,7 +288,7 @@ void __init mvebu_clk_gating_setup(struct device_node *np, of_clk_add_provider(np, clk_gating_get_src, ctrl); - register_syscore_ops(&clk_gate_syscore_ops); + register_syscore(&clk_gate_syscore); return; gates_out: diff --git a/drivers/clk/rockchip/clk-rk3288.c b/drivers/clk/rockchip/clk-rk3288.c index 0a1e017df7c6..9cf3e1e43b78 100644 --- a/drivers/clk/rockchip/clk-rk3288.c +++ b/drivers/clk/rockchip/clk-rk3288.c @@ -871,7 +871,7 @@ static const int rk3288_saved_cru_reg_ids[] = { static u32 rk3288_saved_cru_regs[ARRAY_SIZE(rk3288_saved_cru_reg_ids)]; -static int rk3288_clk_suspend(void) +static int rk3288_clk_suspend(void *data) { int i, reg_id; @@ -906,7 +906,7 @@ static int rk3288_clk_suspend(void) return 0; } -static void rk3288_clk_resume(void) +static void rk3288_clk_resume(void *data) { int i, reg_id; @@ -923,11 +923,15 @@ static void rk3288_clk_shutdown(void) writel_relaxed(0xf3030000, rk3288_cru_base + RK3288_MODE_CON); } -static struct syscore_ops rk3288_clk_syscore_ops = { +static const struct syscore_ops rk3288_clk_syscore_ops = { .suspend = rk3288_clk_suspend, .resume = rk3288_clk_resume, }; +static struct syscore rk3288_clk_syscore = { + .ops = &rk3288_clk_syscore_ops, +}; + static void __init rk3288_common_init(struct device_node *np, enum rk3288_variant soc) { @@ -976,7 +980,7 @@ static void __init rk3288_common_init(struct device_node *np, rockchip_register_restart_notifier(ctx, RK3288_GLB_SRST_FST, rk3288_clk_shutdown); - register_syscore_ops(&rk3288_clk_syscore_ops); + register_syscore(&rk3288_clk_syscore); rockchip_clk_of_add_provider(np, ctx); } diff --git a/drivers/clk/samsung/clk-s5pv210-audss.c b/drivers/clk/samsung/clk-s5pv210-audss.c index b1fd8fac3a4c..c9fcb23de183 100644 --- a/drivers/clk/samsung/clk-s5pv210-audss.c +++ b/drivers/clk/samsung/clk-s5pv210-audss.c @@ -36,7 +36,7 @@ static unsigned long reg_save[][2] = { {ASS_CLK_GATE, 0}, }; -static int s5pv210_audss_clk_suspend(void) +static int s5pv210_audss_clk_suspend(void *data) { int i; @@ -46,7 +46,7 @@ static int s5pv210_audss_clk_suspend(void) return 0; } -static void s5pv210_audss_clk_resume(void) +static void s5pv210_audss_clk_resume(void *data) { int i; @@ -54,10 +54,14 @@ static void s5pv210_audss_clk_resume(void) writel(reg_save[i][1], reg_base + reg_save[i][0]); } -static struct syscore_ops s5pv210_audss_clk_syscore_ops = { +static const struct syscore_ops s5pv210_audss_clk_syscore_ops = { .suspend = s5pv210_audss_clk_suspend, .resume = s5pv210_audss_clk_resume, }; + +static struct syscore s5pv210_audss_clk_syscore = { + .ops = &s5pv210_audss_clk_syscore_ops, +}; #endif /* CONFIG_PM_SLEEP */ /* register s5pv210_audss clocks */ @@ -175,7 +179,7 @@ static int s5pv210_audss_clk_probe(struct platform_device *pdev) } #ifdef CONFIG_PM_SLEEP - register_syscore_ops(&s5pv210_audss_clk_syscore_ops); + register_syscore(&s5pv210_audss_clk_syscore); #endif return 0; diff --git a/drivers/clk/samsung/clk.c b/drivers/clk/samsung/clk.c index dbc9925ca8f4..c149ca6c2217 100644 --- a/drivers/clk/samsung/clk.c +++ b/drivers/clk/samsung/clk.c @@ -271,7 +271,7 @@ void __init samsung_clk_of_register_fixed_ext(struct samsung_clk_provider *ctx, } #ifdef CONFIG_PM_SLEEP -static int samsung_clk_suspend(void) +static int samsung_clk_suspend(void *data) { struct samsung_clock_reg_cache *reg_cache; @@ -284,7 +284,7 @@ static int samsung_clk_suspend(void) return 0; } -static void samsung_clk_resume(void) +static void samsung_clk_resume(void *data) { struct samsung_clock_reg_cache *reg_cache; @@ -293,11 +293,15 @@ static void samsung_clk_resume(void) reg_cache->rd_num); } -static struct syscore_ops samsung_clk_syscore_ops = { +static const struct syscore_ops samsung_clk_syscore_ops = { .suspend = samsung_clk_suspend, .resume = samsung_clk_resume, }; +static struct syscore samsung_clk_syscore = { + .ops = &samsung_clk_syscore_ops, +}; + void samsung_clk_extended_sleep_init(void __iomem *reg_base, const unsigned long *rdump, unsigned long nr_rdump, @@ -316,7 +320,7 @@ void samsung_clk_extended_sleep_init(void __iomem *reg_base, panic("could not allocate register dump storage.\n"); if (list_empty(&clock_reg_cache_list)) - register_syscore_ops(&samsung_clk_syscore_ops); + register_syscore(&samsung_clk_syscore); reg_cache->reg_base = reg_base; reg_cache->rd_num = nr_rdump; diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 412902f573b5..504d0ea997a5 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -3444,7 +3444,7 @@ static void tegra210_disable_cpu_clock(u32 cpu) static u32 spare_reg_ctx, misc_clk_enb_ctx, clk_msk_arm_ctx; static u32 cpu_softrst_ctx[3]; -static int tegra210_clk_suspend(void) +static int tegra210_clk_suspend(void *data) { unsigned int i; @@ -3465,7 +3465,7 @@ static int tegra210_clk_suspend(void) return 0; } -static void tegra210_clk_resume(void) +static void tegra210_clk_resume(void *data) { unsigned int i; @@ -3523,13 +3523,17 @@ static void tegra210_cpu_clock_resume(void) } #endif -static struct syscore_ops tegra_clk_syscore_ops = { +static const struct syscore_ops tegra_clk_syscore_ops = { #ifdef CONFIG_PM_SLEEP .suspend = tegra210_clk_suspend, .resume = tegra210_clk_resume, #endif }; +static struct syscore tegra_clk_syscore = { + .ops = &tegra_clk_syscore_ops, +}; + static struct tegra_cpu_car_ops tegra210_cpu_car_ops = { .wait_for_reset = tegra210_wait_cpu_in_reset, .disable_clock = tegra210_disable_cpu_clock, @@ -3813,6 +3817,6 @@ static void __init tegra210_clock_init(struct device_node *np) tegra_cpu_car_ops = &tegra210_cpu_car_ops; - register_syscore_ops(&tegra_clk_syscore_ops); + register_syscore(&tegra_clk_syscore); } CLK_OF_DECLARE(tegra210, "nvidia,tegra210-car", tegra210_clock_init); diff --git a/drivers/clocksource/timer-armada-370-xp.c b/drivers/clocksource/timer-armada-370-xp.c index 54284c1c0651..f2b4cc40db93 100644 --- a/drivers/clocksource/timer-armada-370-xp.c +++ b/drivers/clocksource/timer-armada-370-xp.c @@ -207,14 +207,14 @@ static int armada_370_xp_timer_dying_cpu(unsigned int cpu) static u32 timer0_ctrl_reg, timer0_local_ctrl_reg; -static int armada_370_xp_timer_suspend(void) +static int armada_370_xp_timer_suspend(void *data) { timer0_ctrl_reg = readl(timer_base + TIMER_CTRL_OFF); timer0_local_ctrl_reg = readl(local_base + TIMER_CTRL_OFF); return 0; } -static void armada_370_xp_timer_resume(void) +static void armada_370_xp_timer_resume(void *data) { writel(0xffffffff, timer_base + TIMER0_VAL_OFF); writel(0xffffffff, timer_base + TIMER0_RELOAD_OFF); @@ -222,11 +222,15 @@ static void armada_370_xp_timer_resume(void) writel(timer0_local_ctrl_reg, local_base + TIMER_CTRL_OFF); } -static struct syscore_ops armada_370_xp_timer_syscore_ops = { +static const struct syscore_ops armada_370_xp_timer_syscore_ops = { .suspend = armada_370_xp_timer_suspend, .resume = armada_370_xp_timer_resume, }; +static struct syscore armada_370_xp_timer_syscore = { + .ops = &armada_370_xp_timer_syscore_ops, +}; + static unsigned long armada_370_delay_timer_read(void) { return ~readl(timer_base + TIMER0_VAL_OFF); @@ -324,7 +328,7 @@ static int __init armada_370_xp_timer_common_init(struct device_node *np) return res; } - register_syscore_ops(&armada_370_xp_timer_syscore_ops); + register_syscore(&armada_370_xp_timer_syscore); return 0; } diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index b19bc60cc627..3372e1f90561 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -177,26 +177,30 @@ static void psci_idle_syscore_switch(bool suspend) } } -static int psci_idle_syscore_suspend(void) +static int psci_idle_syscore_suspend(void *data) { psci_idle_syscore_switch(true); return 0; } -static void psci_idle_syscore_resume(void) +static void psci_idle_syscore_resume(void *data) { psci_idle_syscore_switch(false); } -static struct syscore_ops psci_idle_syscore_ops = { +static const struct syscore_ops psci_idle_syscore_ops = { .suspend = psci_idle_syscore_suspend, .resume = psci_idle_syscore_resume, }; +static struct syscore psci_idle_syscore = { + .ops = &psci_idle_syscore_ops, +}; + static void psci_idle_init_syscore(void) { if (psci_cpuidle_use_syscore) - register_syscore_ops(&psci_idle_syscore_ops); + register_syscore(&psci_idle_syscore); } static void psci_idle_init_cpuhp(void) diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 52060b3ec745..d7666fe9dbf8 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -667,7 +667,7 @@ static const struct dev_pm_ops mxc_gpio_dev_pm_ops = { RUNTIME_PM_OPS(mxc_gpio_runtime_suspend, mxc_gpio_runtime_resume, NULL) }; -static int mxc_gpio_syscore_suspend(void) +static int mxc_gpio_syscore_suspend(void *data) { struct mxc_gpio_port *port; int ret; @@ -684,7 +684,7 @@ static int mxc_gpio_syscore_suspend(void) return 0; } -static void mxc_gpio_syscore_resume(void) +static void mxc_gpio_syscore_resume(void *data) { struct mxc_gpio_port *port; int ret; @@ -701,11 +701,15 @@ static void mxc_gpio_syscore_resume(void) } } -static struct syscore_ops mxc_gpio_syscore_ops = { +static const struct syscore_ops mxc_gpio_syscore_ops = { .suspend = mxc_gpio_syscore_suspend, .resume = mxc_gpio_syscore_resume, }; +static struct syscore mxc_gpio_syscore = { + .ops = &mxc_gpio_syscore_ops, +}; + static struct platform_driver mxc_gpio_driver = { .driver = { .name = "gpio-mxc", @@ -718,7 +722,7 @@ static struct platform_driver mxc_gpio_driver = { static int __init gpio_mxc_init(void) { - register_syscore_ops(&mxc_gpio_syscore_ops); + register_syscore(&mxc_gpio_syscore); return platform_driver_register(&mxc_gpio_driver); } diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c index fa22f3faa163..664cf1eef494 100644 --- a/drivers/gpio/gpio-pxa.c +++ b/drivers/gpio/gpio-pxa.c @@ -747,7 +747,7 @@ static int __init pxa_gpio_dt_init(void) device_initcall(pxa_gpio_dt_init); #ifdef CONFIG_PM -static int pxa_gpio_suspend(void) +static int pxa_gpio_suspend(void *data) { struct pxa_gpio_chip *pchip = pxa_gpio_chip; struct pxa_gpio_bank *c; @@ -768,7 +768,7 @@ static int pxa_gpio_suspend(void) return 0; } -static void pxa_gpio_resume(void) +static void pxa_gpio_resume(void *data) { struct pxa_gpio_chip *pchip = pxa_gpio_chip; struct pxa_gpio_bank *c; @@ -792,14 +792,18 @@ static void pxa_gpio_resume(void) #define pxa_gpio_resume NULL #endif -static struct syscore_ops pxa_gpio_syscore_ops = { +static const struct syscore_ops pxa_gpio_syscore_ops = { .suspend = pxa_gpio_suspend, .resume = pxa_gpio_resume, }; +static struct syscore pxa_gpio_syscore = { + .ops = &pxa_gpio_syscore_ops, +}; + static int __init pxa_gpio_sysinit(void) { - register_syscore_ops(&pxa_gpio_syscore_ops); + register_syscore(&pxa_gpio_syscore); return 0; } postcore_initcall(pxa_gpio_sysinit); diff --git a/drivers/gpio/gpio-sa1100.c b/drivers/gpio/gpio-sa1100.c index 7f6a62f5d1ee..1938ffa2f4f3 100644 --- a/drivers/gpio/gpio-sa1100.c +++ b/drivers/gpio/gpio-sa1100.c @@ -256,7 +256,7 @@ static void sa1100_gpio_handler(struct irq_desc *desc) } while (mask); } -static int sa1100_gpio_suspend(void) +static int sa1100_gpio_suspend(void *data) { struct sa1100_gpio_chip *sgc = &sa1100_gpio_chip; @@ -275,19 +275,23 @@ static int sa1100_gpio_suspend(void) return 0; } -static void sa1100_gpio_resume(void) +static void sa1100_gpio_resume(void *data) { sa1100_update_edge_regs(&sa1100_gpio_chip); } -static struct syscore_ops sa1100_gpio_syscore_ops = { +static const struct syscore_ops sa1100_gpio_syscore_ops = { .suspend = sa1100_gpio_suspend, .resume = sa1100_gpio_resume, }; +static struct syscore sa1100_gpio_syscore = { + .ops = &sa1100_gpio_syscore_ops, +}; + static int __init sa1100_gpio_init_devicefs(void) { - register_syscore_ops(&sa1100_gpio_syscore_ops); + register_syscore(&sa1100_gpio_syscore); return 0; } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..67734dc73e16 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2801,7 +2801,7 @@ static void hv_crash_handler(struct pt_regs *regs) hv_synic_disable_regs(cpu); }; -static int hv_synic_suspend(void) +static int hv_synic_suspend(void *data) { /* * When we reach here, all the non-boot CPUs have been offlined. @@ -2828,7 +2828,7 @@ static int hv_synic_suspend(void) return 0; } -static void hv_synic_resume(void) +static void hv_synic_resume(void *data) { hv_synic_enable_regs(0); @@ -2840,11 +2840,15 @@ static void hv_synic_resume(void) } /* The callbacks run only on CPU0, with irqs_disabled. */ -static struct syscore_ops hv_synic_syscore_ops = { +static const struct syscore_ops hv_synic_syscore_ops = { .suspend = hv_synic_suspend, .resume = hv_synic_resume, }; +static struct syscore hv_synic_syscore = { + .ops = &hv_synic_syscore_ops, +}; + static int __init hv_acpi_init(void) { int ret; @@ -2887,7 +2891,7 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); - register_syscore_ops(&hv_synic_syscore_ops); + register_syscore(&hv_synic_syscore); return 0; @@ -2901,7 +2905,7 @@ static void __exit vmbus_exit(void) { int cpu; - unregister_syscore_ops(&hv_synic_syscore_ops); + unregister_syscore(&hv_synic_syscore); hv_remove_kexec_handler(); hv_remove_crash_handler(); diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index f2991c11867c..b763f4c9c5a7 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3024,7 +3024,7 @@ static void disable_iommus(void) * disable suspend until real resume implemented */ -static void amd_iommu_resume(void) +static void amd_iommu_resume(void *data) { struct amd_iommu *iommu; @@ -3038,7 +3038,7 @@ static void amd_iommu_resume(void) amd_iommu_enable_interrupts(); } -static int amd_iommu_suspend(void) +static int amd_iommu_suspend(void *data) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -3046,11 +3046,15 @@ static int amd_iommu_suspend(void) return 0; } -static struct syscore_ops amd_iommu_syscore_ops = { +static const struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; +static struct syscore amd_iommu_syscore = { + .ops = &amd_iommu_syscore_ops, +}; + static void __init free_iommu_resources(void) { free_iommu_all(); @@ -3395,7 +3399,7 @@ static int __init state_next(void) init_state = IOMMU_ENABLED; break; case IOMMU_ENABLED: - register_syscore_ops(&amd_iommu_syscore_ops); + register_syscore(&amd_iommu_syscore); iommu_snp_enable(); ret = amd_iommu_init_pci(); init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT; @@ -3498,12 +3502,12 @@ int __init amd_iommu_enable(void) void amd_iommu_disable(void) { - amd_iommu_suspend(); + amd_iommu_suspend(NULL); } int amd_iommu_reenable(int mode) { - amd_iommu_resume(); + amd_iommu_resume(NULL); return 0; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e236c7ec221f..fdaf7f64dd33 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2303,7 +2303,7 @@ static void iommu_flush_all(void) } } -static int iommu_suspend(void) +static int iommu_suspend(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2330,7 +2330,7 @@ static int iommu_suspend(void) return 0; } -static void iommu_resume(void) +static void iommu_resume(void *data) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -2361,14 +2361,18 @@ static void iommu_resume(void) } } -static struct syscore_ops iommu_syscore_ops = { +static const struct syscore_ops iommu_syscore_ops = { .resume = iommu_resume, .suspend = iommu_suspend, }; +static struct syscore iommu_syscore = { + .ops = &iommu_syscore_ops, +}; + static void __init init_iommu_pm_ops(void) { - register_syscore_ops(&iommu_syscore_ops); + register_syscore(&iommu_syscore); } #else diff --git a/drivers/irqchip/exynos-combiner.c b/drivers/irqchip/exynos-combiner.c index e7dfcf0cda43..495848442b35 100644 --- a/drivers/irqchip/exynos-combiner.c +++ b/drivers/irqchip/exynos-combiner.c @@ -200,12 +200,13 @@ static void __init combiner_init(void __iomem *combiner_base, /** * combiner_suspend - save interrupt combiner state before suspend + * @data: syscore context * * Save the interrupt enable set register for all combiner groups since * the state is lost when the system enters into a sleep state. * */ -static int combiner_suspend(void) +static int combiner_suspend(void *data) { int i; @@ -218,12 +219,13 @@ static int combiner_suspend(void) /** * combiner_resume - restore interrupt combiner state after resume + * @data: syscore context * * Restore the interrupt enable set register for all combiner groups since * the state is lost when the system enters into a sleep state on suspend. * */ -static void combiner_resume(void) +static void combiner_resume(void *data) { int i; @@ -240,11 +242,15 @@ static void combiner_resume(void) #define combiner_resume NULL #endif -static struct syscore_ops combiner_syscore_ops = { +static const struct syscore_ops combiner_syscore_ops = { .suspend = combiner_suspend, .resume = combiner_resume, }; +static struct syscore combiner_syscore = { + .ops = &combiner_syscore_ops, +}; + static int __init combiner_of_init(struct device_node *np, struct device_node *parent) { @@ -264,7 +270,7 @@ static int __init combiner_of_init(struct device_node *np, combiner_init(combiner_base, np); - register_syscore_ops(&combiner_syscore_ops); + register_syscore(&combiner_syscore); return 0; } diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index a44c49e985b7..a4d03a2d1569 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -726,7 +726,7 @@ static void __exception_irq_entry mpic_handle_irq(struct pt_regs *regs) } while (1); } -static int mpic_suspend(void) +static int mpic_suspend(void *data) { struct mpic *mpic = mpic_data; @@ -735,7 +735,7 @@ static int mpic_suspend(void) return 0; } -static void mpic_resume(void) +static void mpic_resume(void *data) { struct mpic *mpic = mpic_data; bool src0, src1; @@ -788,11 +788,15 @@ static void mpic_resume(void) mpic_ipi_resume(mpic); } -static struct syscore_ops mpic_syscore_ops = { +static const struct syscore_ops mpic_syscore_ops = { .suspend = mpic_suspend, .resume = mpic_resume, }; +static struct syscore mpic_syscore = { + .ops = &mpic_syscore_ops, +}; + static int __init mpic_map_region(struct device_node *np, int index, void __iomem **base, phys_addr_t *phys_base) { @@ -905,7 +909,7 @@ static int __init mpic_of_init(struct device_node *node, struct device_node *par mpic_handle_cascade_irq, mpic); } - register_syscore_ops(&mpic_syscore_ops); + register_syscore(&mpic_syscore); return 0; } diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index 04fac0cc857f..cd9a56459f99 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -292,7 +292,7 @@ static int __init bcm7038_l1_init_one(struct device_node *dn, static LIST_HEAD(bcm7038_l1_intcs_list); static DEFINE_RAW_SPINLOCK(bcm7038_l1_intcs_lock); -static int bcm7038_l1_suspend(void) +static int bcm7038_l1_suspend(void *data) { struct bcm7038_l1_chip *intc; int boot_cpu, word; @@ -318,7 +318,7 @@ static int bcm7038_l1_suspend(void) return 0; } -static void bcm7038_l1_resume(void) +static void bcm7038_l1_resume(void *data) { struct bcm7038_l1_chip *intc; int boot_cpu, word; @@ -339,11 +339,15 @@ static void bcm7038_l1_resume(void) } } -static struct syscore_ops bcm7038_l1_syscore_ops = { +static const struct syscore_ops bcm7038_l1_syscore_ops = { .suspend = bcm7038_l1_suspend, .resume = bcm7038_l1_resume, }; +static struct syscore bcm7038_l1_syscore = { + .ops = &bcm7038_l1_syscore_ops, +}; + static int bcm7038_l1_set_wake(struct irq_data *d, unsigned int on) { struct bcm7038_l1_chip *intc = irq_data_get_irq_chip_data(d); @@ -431,7 +435,7 @@ static int __init bcm7038_l1_of_init(struct device_node *dn, raw_spin_unlock(&bcm7038_l1_intcs_lock); if (list_is_singular(&bcm7038_l1_intcs_list)) - register_syscore_ops(&bcm7038_l1_syscore_ops); + register_syscore(&bcm7038_l1_syscore); #endif pr_info("registered BCM7038 L1 intc (%pOF, IRQs: %d)\n", diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 467cb78435a9..ada585bfa451 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4992,7 +4992,7 @@ static void its_enable_quirks(struct its_node *its) its_quirks, its); } -static int its_save_disable(void) +static int its_save_disable(void *data) { struct its_node *its; int err = 0; @@ -5028,7 +5028,7 @@ err: return err; } -static void its_restore_enable(void) +static void its_restore_enable(void *data) { struct its_node *its; int ret; @@ -5088,11 +5088,15 @@ static void its_restore_enable(void) raw_spin_unlock(&its_lock); } -static struct syscore_ops its_syscore_ops = { +static const struct syscore_ops its_syscore_ops = { .suspend = its_save_disable, .resume = its_restore_enable, }; +static struct syscore its_syscore = { + .ops = &its_syscore_ops, +}; + static void __init __iomem *its_map_one(struct resource *res, int *err) { void __iomem *its_base; @@ -5864,7 +5868,7 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, } } - register_syscore_ops(&its_syscore_ops); + register_syscore(&its_syscore); return 0; } diff --git a/drivers/irqchip/irq-i8259.c b/drivers/irqchip/irq-i8259.c index 91b2f587119c..cca77f9948a3 100644 --- a/drivers/irqchip/irq-i8259.c +++ b/drivers/irqchip/irq-i8259.c @@ -202,13 +202,13 @@ spurious_8259A_irq: } } -static void i8259A_resume(void) +static void i8259A_resume(void *data) { if (i8259A_auto_eoi >= 0) init_8259A(i8259A_auto_eoi); } -static void i8259A_shutdown(void) +static void i8259A_shutdown(void *data) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -220,11 +220,15 @@ static void i8259A_shutdown(void) } } -static struct syscore_ops i8259_syscore_ops = { +static const struct syscore_ops i8259_syscore_ops = { .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; +static struct syscore i8259_syscore = { + .ops = &i8259_syscore_ops, +}; + static void init_8259A(int auto_eoi) { unsigned long flags; @@ -320,7 +324,7 @@ struct irq_domain * __init __init_i8259_irqs(struct device_node *node) if (request_irq(irq, no_action, IRQF_NO_THREAD, "cascade", NULL)) pr_err("Failed to register cascade interrupt\n"); - register_syscore_ops(&i8259_syscore_ops); + register_syscore(&i8259_syscore); return domain; } diff --git a/drivers/irqchip/irq-imx-gpcv2.c b/drivers/irqchip/irq-imx-gpcv2.c index b91f5c14b405..04f7ba0657be 100644 --- a/drivers/irqchip/irq-imx-gpcv2.c +++ b/drivers/irqchip/irq-imx-gpcv2.c @@ -33,7 +33,7 @@ static void __iomem *gpcv2_idx_to_reg(struct gpcv2_irqchip_data *cd, int i) return cd->gpc_base + cd->cpu2wakeup + i * 4; } -static int gpcv2_wakeup_source_save(void) +static int gpcv2_wakeup_source_save(void *data) { struct gpcv2_irqchip_data *cd; void __iomem *reg; @@ -52,7 +52,7 @@ static int gpcv2_wakeup_source_save(void) return 0; } -static void gpcv2_wakeup_source_restore(void) +static void gpcv2_wakeup_source_restore(void *data) { struct gpcv2_irqchip_data *cd; int i; @@ -65,9 +65,13 @@ static void gpcv2_wakeup_source_restore(void) writel_relaxed(cd->saved_irq_mask[i], gpcv2_idx_to_reg(cd, i)); } -static struct syscore_ops imx_gpcv2_syscore_ops = { - .suspend = gpcv2_wakeup_source_save, - .resume = gpcv2_wakeup_source_restore, +static const struct syscore_ops gpcv2_syscore_ops = { + .suspend = gpcv2_wakeup_source_save, + .resume = gpcv2_wakeup_source_restore, +}; + +static struct syscore gpcv2_syscore = { + .ops = &gpcv2_syscore_ops, }; static int imx_gpcv2_irq_set_wake(struct irq_data *d, unsigned int on) @@ -276,7 +280,7 @@ static int __init imx_gpcv2_irqchip_init(struct device_node *node, writel_relaxed(~0x1, cd->gpc_base + cd->cpu2wakeup); imx_gpcv2_instance = cd; - register_syscore_ops(&imx_gpcv2_syscore_ops); + register_syscore(&gpcv2_syscore); /* * Clear the OF_POPULATED flag set in of_irq_init so that diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c index 39e5a72ccd3c..ad2105685b48 100644 --- a/drivers/irqchip/irq-loongson-eiointc.c +++ b/drivers/irqchip/irq-loongson-eiointc.c @@ -407,21 +407,25 @@ static struct irq_domain *acpi_get_vec_parent(int node, struct acpi_vector_group return NULL; } -static int eiointc_suspend(void) +static int eiointc_suspend(void *data) { return 0; } -static void eiointc_resume(void) +static void eiointc_resume(void *data) { eiointc_router_init(0); } -static struct syscore_ops eiointc_syscore_ops = { +static const struct syscore_ops eiointc_syscore_ops = { .suspend = eiointc_suspend, .resume = eiointc_resume, }; +static struct syscore eiointc_syscore = { + .ops = &eiointc_syscore_ops, +}; + static int __init pch_pic_parse_madt(union acpi_subtable_headers *header, const unsigned long end) { @@ -540,7 +544,7 @@ static int __init eiointc_init(struct eiointc_priv *priv, int parent_irq, eiointc_router_init(0); if (nr_pics == 1) { - register_syscore_ops(&eiointc_syscore_ops); + register_syscore(&eiointc_syscore); cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_EIOINTC_STARTING, "irqchip/loongarch/eiointc:starting", eiointc_router_init, NULL); diff --git a/drivers/irqchip/irq-loongson-htpic.c b/drivers/irqchip/irq-loongson-htpic.c index f4abdf156de7..1c691c4be989 100644 --- a/drivers/irqchip/irq-loongson-htpic.c +++ b/drivers/irqchip/irq-loongson-htpic.c @@ -71,15 +71,19 @@ static void htpic_reg_init(void) writel(0xffff, htpic->base + HTINT_EN_OFF); } -static void htpic_resume(void) +static void htpic_resume(void *data) { htpic_reg_init(); } -struct syscore_ops htpic_syscore_ops = { +static const struct syscore_ops htpic_syscore_ops = { .resume = htpic_resume, }; +static struct syscore htpic_syscore = { + .ops = &htpic_syscore_ops, +}; + static int __init htpic_of_init(struct device_node *node, struct device_node *parent) { unsigned int parent_irq[4]; @@ -130,7 +134,7 @@ static int __init htpic_of_init(struct device_node *node, struct device_node *pa htpic_irq_dispatch, htpic); } - register_syscore_ops(&htpic_syscore_ops); + register_syscore(&htpic_syscore); return 0; diff --git a/drivers/irqchip/irq-loongson-htvec.c b/drivers/irqchip/irq-loongson-htvec.c index d8558eb35044..d2be8e954e92 100644 --- a/drivers/irqchip/irq-loongson-htvec.c +++ b/drivers/irqchip/irq-loongson-htvec.c @@ -159,7 +159,7 @@ static void htvec_reset(struct htvec *priv) } } -static int htvec_suspend(void) +static int htvec_suspend(void *data) { int i; @@ -169,7 +169,7 @@ static int htvec_suspend(void) return 0; } -static void htvec_resume(void) +static void htvec_resume(void *data) { int i; @@ -177,11 +177,15 @@ static void htvec_resume(void) writel(htvec_priv->saved_vec_en[i], htvec_priv->base + HTVEC_EN_OFF + 4 * i); } -static struct syscore_ops htvec_syscore_ops = { +static const struct syscore_ops htvec_syscore_ops = { .suspend = htvec_suspend, .resume = htvec_resume, }; +static struct syscore htvec_syscore = { + .ops = &htvec_syscore_ops, +}; + static int htvec_init(phys_addr_t addr, unsigned long size, int num_parents, int parent_irq[], struct fwnode_handle *domain_handle) { @@ -214,7 +218,7 @@ static int htvec_init(phys_addr_t addr, unsigned long size, htvec_priv = priv; - register_syscore_ops(&htvec_syscore_ops); + register_syscore(&htvec_syscore); return 0; diff --git a/drivers/irqchip/irq-loongson-pch-lpc.c b/drivers/irqchip/irq-loongson-pch-lpc.c index 912bf50a5c7c..3a125f3e4287 100644 --- a/drivers/irqchip/irq-loongson-pch-lpc.c +++ b/drivers/irqchip/irq-loongson-pch-lpc.c @@ -151,7 +151,7 @@ static int pch_lpc_disabled(struct pch_lpc *priv) (readl(priv->base + LPC_INT_STS) == 0xffffffff); } -static int pch_lpc_suspend(void) +static int pch_lpc_suspend(void *data) { pch_lpc_priv->saved_reg_ctl = readl(pch_lpc_priv->base + LPC_INT_CTL); pch_lpc_priv->saved_reg_ena = readl(pch_lpc_priv->base + LPC_INT_ENA); @@ -159,18 +159,22 @@ static int pch_lpc_suspend(void) return 0; } -static void pch_lpc_resume(void) +static void pch_lpc_resume(void *data) { writel(pch_lpc_priv->saved_reg_ctl, pch_lpc_priv->base + LPC_INT_CTL); writel(pch_lpc_priv->saved_reg_ena, pch_lpc_priv->base + LPC_INT_ENA); writel(pch_lpc_priv->saved_reg_pol, pch_lpc_priv->base + LPC_INT_POL); } -static struct syscore_ops pch_lpc_syscore_ops = { +static const struct syscore_ops pch_lpc_syscore_ops = { .suspend = pch_lpc_suspend, .resume = pch_lpc_resume, }; +static struct syscore pch_lpc_syscore = { + .ops = &pch_lpc_syscore_ops, +}; + int __init pch_lpc_acpi_init(struct irq_domain *parent, struct acpi_madt_lpc_pic *acpi_pchlpc) { @@ -222,7 +226,7 @@ int __init pch_lpc_acpi_init(struct irq_domain *parent, pch_lpc_priv = priv; pch_lpc_handle = irq_handle; - register_syscore_ops(&pch_lpc_syscore_ops); + register_syscore(&pch_lpc_syscore); return 0; diff --git a/drivers/irqchip/irq-loongson-pch-pic.c b/drivers/irqchip/irq-loongson-pch-pic.c index 62e6bf3a0611..c6b369a974a7 100644 --- a/drivers/irqchip/irq-loongson-pch-pic.c +++ b/drivers/irqchip/irq-loongson-pch-pic.c @@ -278,7 +278,7 @@ static void pch_pic_reset(struct pch_pic *priv) } } -static int pch_pic_suspend(void) +static int pch_pic_suspend(void *data) { int i, j; @@ -296,7 +296,7 @@ static int pch_pic_suspend(void) return 0; } -static void pch_pic_resume(void) +static void pch_pic_resume(void *data) { int i, j; @@ -313,11 +313,15 @@ static void pch_pic_resume(void) } } -static struct syscore_ops pch_pic_syscore_ops = { +static const struct syscore_ops pch_pic_syscore_ops = { .suspend = pch_pic_suspend, .resume = pch_pic_resume, }; +static struct syscore pch_pic_syscore = { + .ops = &pch_pic_syscore_ops, +}; + static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base, struct irq_domain *parent_domain, struct fwnode_handle *domain_handle, u32 gsi_base) @@ -356,7 +360,7 @@ static int pch_pic_init(phys_addr_t addr, unsigned long size, int vec_base, pch_pic_priv[nr_pics++] = priv; if (nr_pics == 1) - register_syscore_ops(&pch_pic_syscore_ops); + register_syscore(&pch_pic_syscore); return 0; diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c index 516a3a0e359c..be83e6e422c3 100644 --- a/drivers/irqchip/irq-mchp-eic.c +++ b/drivers/irqchip/irq-mchp-eic.c @@ -109,7 +109,7 @@ static int mchp_eic_irq_set_wake(struct irq_data *d, unsigned int on) return 0; } -static int mchp_eic_irq_suspend(void) +static int mchp_eic_irq_suspend(void *data) { unsigned int hwirq; @@ -123,7 +123,7 @@ static int mchp_eic_irq_suspend(void) return 0; } -static void mchp_eic_irq_resume(void) +static void mchp_eic_irq_resume(void *data) { unsigned int hwirq; @@ -135,11 +135,15 @@ static void mchp_eic_irq_resume(void) MCHP_EIC_SCFG(hwirq)); } -static struct syscore_ops mchp_eic_syscore_ops = { +static const struct syscore_ops mchp_eic_syscore_ops = { .suspend = mchp_eic_irq_suspend, .resume = mchp_eic_irq_resume, }; +static struct syscore mchp_eic_syscore = { + .ops = &mchp_eic_syscore_ops, +}; + static struct irq_chip mchp_eic_chip = { .name = "eic", .flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_SET_TYPE_MASKED, @@ -257,7 +261,7 @@ static int mchp_eic_init(struct device_node *node, struct device_node *parent) goto clk_unprepare; } - register_syscore_ops(&mchp_eic_syscore_ops); + register_syscore(&mchp_eic_syscore); pr_info("%pOF: EIC registered, nr_irqs %u\n", node, MCHP_EIC_NIRQ); diff --git a/drivers/irqchip/irq-mst-intc.c b/drivers/irqchip/irq-mst-intc.c index 9643cc3a77d7..7f760f555a76 100644 --- a/drivers/irqchip/irq-mst-intc.c +++ b/drivers/irqchip/irq-mst-intc.c @@ -143,7 +143,7 @@ static void mst_intc_polarity_restore(struct mst_intc_chip_data *cd) writew_relaxed(cd->saved_polarity_conf[i], addr + i * 4); } -static void mst_irq_resume(void) +static void mst_irq_resume(void *data) { struct mst_intc_chip_data *cd; @@ -151,7 +151,7 @@ static void mst_irq_resume(void) mst_intc_polarity_restore(cd); } -static int mst_irq_suspend(void) +static int mst_irq_suspend(void *data) { struct mst_intc_chip_data *cd; @@ -160,14 +160,18 @@ static int mst_irq_suspend(void) return 0; } -static struct syscore_ops mst_irq_syscore_ops = { +static const struct syscore_ops mst_irq_syscore_ops = { .suspend = mst_irq_suspend, .resume = mst_irq_resume, }; +static struct syscore mst_irq_syscore = { + .ops = &mst_irq_syscore_ops, +}; + static int __init mst_irq_pm_init(void) { - register_syscore_ops(&mst_irq_syscore_ops); + register_syscore(&mst_irq_syscore); return 0; } late_initcall(mst_irq_pm_init); diff --git a/drivers/irqchip/irq-mtk-cirq.c b/drivers/irqchip/irq-mtk-cirq.c index de481ba340f8..9571f622774e 100644 --- a/drivers/irqchip/irq-mtk-cirq.c +++ b/drivers/irqchip/irq-mtk-cirq.c @@ -199,7 +199,7 @@ static const struct irq_domain_ops cirq_domain_ops = { }; #ifdef CONFIG_PM_SLEEP -static int mtk_cirq_suspend(void) +static int mtk_cirq_suspend(void *data) { void __iomem *reg; u32 value, mask; @@ -257,7 +257,7 @@ static int mtk_cirq_suspend(void) return 0; } -static void mtk_cirq_resume(void) +static void mtk_cirq_resume(void *data) { void __iomem *reg = mtk_cirq_reg(cirq_data, CIRQ_CONTROL); u32 value; @@ -272,14 +272,18 @@ static void mtk_cirq_resume(void) writel_relaxed(value, reg); } -static struct syscore_ops mtk_cirq_syscore_ops = { +static const struct syscore_ops mtk_cirq_syscore_ops = { .suspend = mtk_cirq_suspend, .resume = mtk_cirq_resume, }; +static struct syscore mtk_cirq_syscore = { + .ops = &mtk_cirq_syscore_ops, +}; + static void mtk_cirq_syscore_init(void) { - register_syscore_ops(&mtk_cirq_syscore_ops); + register_syscore(&mtk_cirq_syscore); } #else static inline void mtk_cirq_syscore_init(void) {} diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 2a54adeb4cc7..de88e80eac0c 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -399,7 +399,7 @@ static int rzg2l_irqc_set_type(struct irq_data *d, unsigned int type) return irq_chip_set_type_parent(d, IRQ_TYPE_LEVEL_HIGH); } -static int rzg2l_irqc_irq_suspend(void) +static int rzg2l_irqc_irq_suspend(void *data) { struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache; void __iomem *base = rzg2l_irqc_data->base; @@ -411,7 +411,7 @@ static int rzg2l_irqc_irq_suspend(void) return 0; } -static void rzg2l_irqc_irq_resume(void) +static void rzg2l_irqc_irq_resume(void *data) { struct rzg2l_irqc_reg_cache *cache = &rzg2l_irqc_data->cache; void __iomem *base = rzg2l_irqc_data->base; @@ -426,11 +426,15 @@ static void rzg2l_irqc_irq_resume(void) writel_relaxed(cache->iitsr, base + IITSR); } -static struct syscore_ops rzg2l_irqc_syscore_ops = { +static const struct syscore_ops rzg2l_irqc_syscore_ops = { .suspend = rzg2l_irqc_irq_suspend, .resume = rzg2l_irqc_irq_resume, }; +static struct syscore rzg2l_irqc_syscore = { + .ops = &rzg2l_irqc_syscore_ops, +}; + static const struct irq_chip rzg2l_irqc_chip = { .name = "rzg2l-irqc", .irq_eoi = rzg2l_irqc_eoi, @@ -581,7 +585,7 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * return -ENOMEM; } - register_syscore_ops(&rzg2l_irqc_syscore_ops); + register_syscore(&rzg2l_irqc_syscore); /* * Prevent the cleanup function from invoking put_device by assigning diff --git a/drivers/irqchip/irq-sa11x0.c b/drivers/irqchip/irq-sa11x0.c index d8d4dff16276..e5f24c5f3f41 100644 --- a/drivers/irqchip/irq-sa11x0.c +++ b/drivers/irqchip/irq-sa11x0.c @@ -85,7 +85,7 @@ static struct sa1100irq_state { unsigned int iccr; } sa1100irq_state; -static int sa1100irq_suspend(void) +static int sa1100irq_suspend(void *data) { struct sa1100irq_state *st = &sa1100irq_state; @@ -102,7 +102,7 @@ static int sa1100irq_suspend(void) return 0; } -static void sa1100irq_resume(void) +static void sa1100irq_resume(void *data) { struct sa1100irq_state *st = &sa1100irq_state; @@ -114,14 +114,18 @@ static void sa1100irq_resume(void) } } -static struct syscore_ops sa1100irq_syscore_ops = { +static const struct syscore_ops sa1100irq_syscore_ops = { .suspend = sa1100irq_suspend, .resume = sa1100irq_resume, }; +static struct syscore sa1100irq_syscore = { + .ops = &sa1100irq_syscore_ops, +}; + static int __init sa1100irq_init_devicefs(void) { - register_syscore_ops(&sa1100irq_syscore_ops); + register_syscore(&sa1100irq_syscore); return 0; } diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index cbd7697bc148..4f59c0ca1537 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -245,7 +245,7 @@ static int plic_irq_set_type(struct irq_data *d, unsigned int type) return IRQ_SET_MASK_OK; } -static int plic_irq_suspend(void) +static int plic_irq_suspend(void *data) { unsigned int i, cpu; unsigned long flags; @@ -277,7 +277,7 @@ static int plic_irq_suspend(void) return 0; } -static void plic_irq_resume(void) +static void plic_irq_resume(void *data) { unsigned int i, index, cpu; unsigned long flags; @@ -308,11 +308,15 @@ static void plic_irq_resume(void) } } -static struct syscore_ops plic_irq_syscore_ops = { +static const struct syscore_ops plic_irq_syscore_ops = { .suspend = plic_irq_suspend, .resume = plic_irq_resume, }; +static struct syscore plic_irq_syscore = { + .ops = &plic_irq_syscore_ops, +}; + static int plic_irqdomain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { @@ -678,7 +682,7 @@ done: cpuhp_setup_state(CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, "irqchip/sifive/plic:starting", plic_starting_cpu, plic_dying_cpu); - register_syscore_ops(&plic_irq_syscore_ops); + register_syscore(&plic_irq_syscore); plic_global_setup_done = true; } } diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c index 37d4b29763bc..23251831c06e 100644 --- a/drivers/irqchip/irq-sun6i-r.c +++ b/drivers/irqchip/irq-sun6i-r.c @@ -268,7 +268,7 @@ static const struct irq_domain_ops sun6i_r_intc_domain_ops = { .free = irq_domain_free_irqs_common, }; -static int sun6i_r_intc_suspend(void) +static int sun6i_r_intc_suspend(void *data) { u32 buf[BITS_TO_U32(MAX(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))]; int i; @@ -284,7 +284,7 @@ static int sun6i_r_intc_suspend(void) return 0; } -static void sun6i_r_intc_resume(void) +static void sun6i_r_intc_resume(void *data) { int i; @@ -294,17 +294,21 @@ static void sun6i_r_intc_resume(void) writel_relaxed(0, base + SUN6I_IRQ_ENABLE(i)); } -static void sun6i_r_intc_shutdown(void) +static void sun6i_r_intc_shutdown(void *data) { - sun6i_r_intc_suspend(); + sun6i_r_intc_suspend(data); } -static struct syscore_ops sun6i_r_intc_syscore_ops = { +static const struct syscore_ops sun6i_r_intc_syscore_ops = { .suspend = sun6i_r_intc_suspend, .resume = sun6i_r_intc_resume, .shutdown = sun6i_r_intc_shutdown, }; +static struct syscore sun6i_r_intc_syscore = { + .ops = &sun6i_r_intc_syscore_ops, +}; + static int __init sun6i_r_intc_init(struct device_node *node, struct device_node *parent, const struct sun6i_r_intc_variant *v) @@ -346,10 +350,10 @@ static int __init sun6i_r_intc_init(struct device_node *node, return -ENOMEM; } - register_syscore_ops(&sun6i_r_intc_syscore_ops); + register_syscore(&sun6i_r_intc_syscore); sun6i_r_intc_ack_nmi(); - sun6i_r_intc_resume(); + sun6i_r_intc_resume(NULL); return 0; } diff --git a/drivers/irqchip/irq-tegra.c b/drivers/irqchip/irq-tegra.c index 66cbb9f77ff3..b6382cf6359a 100644 --- a/drivers/irqchip/irq-tegra.c +++ b/drivers/irqchip/irq-tegra.c @@ -132,7 +132,7 @@ static int tegra_set_wake(struct irq_data *d, unsigned int enable) return 0; } -static int tegra_ictlr_suspend(void) +static int tegra_ictlr_suspend(void *data) { unsigned long flags; unsigned int i; @@ -161,7 +161,7 @@ static int tegra_ictlr_suspend(void) return 0; } -static void tegra_ictlr_resume(void) +static void tegra_ictlr_resume(void *data) { unsigned long flags; unsigned int i; @@ -184,14 +184,18 @@ static void tegra_ictlr_resume(void) local_irq_restore(flags); } -static struct syscore_ops tegra_ictlr_syscore_ops = { +static const struct syscore_ops tegra_ictlr_syscore_ops = { .suspend = tegra_ictlr_suspend, .resume = tegra_ictlr_resume, }; +static struct syscore tegra_ictlr_syscore = { + .ops = &tegra_ictlr_syscore_ops, +}; + static void tegra_ictlr_syscore_init(void) { - register_syscore_ops(&tegra_ictlr_syscore_ops); + register_syscore(&tegra_ictlr_syscore); } #else #define tegra_set_wake NULL diff --git a/drivers/irqchip/irq-vic.c b/drivers/irqchip/irq-vic.c index 2bcdf216a000..e38104c5064e 100644 --- a/drivers/irqchip/irq-vic.c +++ b/drivers/irqchip/irq-vic.c @@ -120,7 +120,7 @@ static void resume_one_vic(struct vic_device *vic) writel(~vic->soft_int, base + VIC_INT_SOFT_CLEAR); } -static void vic_resume(void) +static void vic_resume(void *data) { int id; @@ -146,7 +146,7 @@ static void suspend_one_vic(struct vic_device *vic) writel(~vic->resume_irqs, base + VIC_INT_ENABLE_CLEAR); } -static int vic_suspend(void) +static int vic_suspend(void *data) { int id; @@ -156,11 +156,15 @@ static int vic_suspend(void) return 0; } -static struct syscore_ops vic_syscore_ops = { +static const struct syscore_ops vic_syscore_ops = { .suspend = vic_suspend, .resume = vic_resume, }; +static struct syscore vic_syscore = { + .ops = &vic_syscore_ops, +}; + /** * vic_pm_init - initcall to register VIC pm * @@ -171,7 +175,7 @@ static struct syscore_ops vic_syscore_ops = { static int __init vic_pm_init(void) { if (vic_id > 0) - register_syscore_ops(&vic_syscore_ops); + register_syscore(&vic_syscore); return 0; } diff --git a/drivers/leds/trigger/ledtrig-cpu.c b/drivers/leds/trigger/ledtrig-cpu.c index 05848a2fecff..679323c2ccda 100644 --- a/drivers/leds/trigger/ledtrig-cpu.c +++ b/drivers/leds/trigger/ledtrig-cpu.c @@ -94,28 +94,32 @@ void ledtrig_cpu(enum cpu_led_event ledevt) } EXPORT_SYMBOL(ledtrig_cpu); -static int ledtrig_cpu_syscore_suspend(void) +static int ledtrig_cpu_syscore_suspend(void *data) { ledtrig_cpu(CPU_LED_STOP); return 0; } -static void ledtrig_cpu_syscore_resume(void) +static void ledtrig_cpu_syscore_resume(void *data) { ledtrig_cpu(CPU_LED_START); } -static void ledtrig_cpu_syscore_shutdown(void) +static void ledtrig_cpu_syscore_shutdown(void *data) { ledtrig_cpu(CPU_LED_HALTED); } -static struct syscore_ops ledtrig_cpu_syscore_ops = { +static const struct syscore_ops ledtrig_cpu_syscore_ops = { .shutdown = ledtrig_cpu_syscore_shutdown, .suspend = ledtrig_cpu_syscore_suspend, .resume = ledtrig_cpu_syscore_resume, }; +static struct syscore ledtrig_cpu_syscore = { + .ops = &ledtrig_cpu_syscore_ops, +}; + static int ledtrig_online_cpu(unsigned int cpu) { ledtrig_cpu(CPU_LED_START); @@ -157,7 +161,7 @@ static int __init ledtrig_cpu_init(void) led_trigger_register_simple(trig->name, &trig->_trig); } - register_syscore_ops(&ledtrig_cpu_syscore_ops); + register_syscore(&ledtrig_cpu_syscore); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "leds/trigger:starting", ledtrig_online_cpu, ledtrig_prepare_down_cpu); diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index b0f09c70f1ff..5fe47e784d43 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2600,7 +2600,7 @@ void pmu_blink(int n) #if defined(CONFIG_SUSPEND) && defined(CONFIG_PPC32) int pmu_sys_suspended; -static int pmu_syscore_suspend(void) +static int pmu_syscore_suspend(void *data) { /* Suspend PMU event interrupts */ pmu_suspend(); @@ -2614,7 +2614,7 @@ static int pmu_syscore_suspend(void) return 0; } -static void pmu_syscore_resume(void) +static void pmu_syscore_resume(void *data) { struct adb_request req; @@ -2634,14 +2634,18 @@ static void pmu_syscore_resume(void) pmu_sys_suspended = 0; } -static struct syscore_ops pmu_syscore_ops = { +static const struct syscore_ops pmu_syscore_ops = { .suspend = pmu_syscore_suspend, .resume = pmu_syscore_resume, }; +static struct syscore pmu_syscore = { + .ops = &pmu_syscore_ops, +}; + static int pmu_syscore_register(void) { - register_syscore_ops(&pmu_syscore_ops); + register_syscore(&pmu_syscore); return 0; } diff --git a/drivers/power/reset/sc27xx-poweroff.c b/drivers/power/reset/sc27xx-poweroff.c index 90287c31992c..393bd1c33b73 100644 --- a/drivers/power/reset/sc27xx-poweroff.c +++ b/drivers/power/reset/sc27xx-poweroff.c @@ -28,7 +28,7 @@ static struct regmap *regmap; * taking cpus down to avoid racing regmap or spi mutex lock when poweroff * system through PMIC. */ -static void sc27xx_poweroff_shutdown(void) +static void sc27xx_poweroff_shutdown(void *data) { #ifdef CONFIG_HOTPLUG_CPU int cpu; @@ -40,10 +40,14 @@ static void sc27xx_poweroff_shutdown(void) #endif } -static struct syscore_ops poweroff_syscore_ops = { +static const struct syscore_ops poweroff_syscore_ops = { .shutdown = sc27xx_poweroff_shutdown, }; +static struct syscore poweroff_syscore = { + .ops = &poweroff_syscore_ops, +}; + static void sc27xx_poweroff_do_poweroff(void) { /* Disable the external subsys connection's power firstly */ @@ -62,7 +66,7 @@ static int sc27xx_poweroff_probe(struct platform_device *pdev) return -ENODEV; pm_power_off = sc27xx_poweroff_do_poweroff; - register_syscore_ops(&poweroff_syscore_ops); + register_syscore(&poweroff_syscore); return 0; } diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c index 7a73f5e4a1fc..f02e12dfa5f6 100644 --- a/drivers/sh/clk/core.c +++ b/drivers/sh/clk/core.c @@ -569,7 +569,7 @@ long clk_round_rate(struct clk *clk, unsigned long rate) EXPORT_SYMBOL_GPL(clk_round_rate); #ifdef CONFIG_PM -static void clks_core_resume(void) +static void clks_core_resume(void *data) { struct clk *clkp; @@ -588,13 +588,17 @@ static void clks_core_resume(void) } } -static struct syscore_ops clks_syscore_ops = { +static const struct syscore_ops clks_syscore_ops = { .resume = clks_core_resume, }; +static struct syscore clks_syscore = { + .ops = &clks_syscore_ops, +}; + static int __init clk_syscore_init(void) { - register_syscore_ops(&clks_syscore_ops); + register_syscore(&clks_syscore); return 0; } diff --git a/drivers/sh/intc/core.c b/drivers/sh/intc/core.c index ea571eeb3078..3dde703b7766 100644 --- a/drivers/sh/intc/core.c +++ b/drivers/sh/intc/core.c @@ -394,7 +394,7 @@ err0: return -ENOMEM; } -static int intc_suspend(void) +static int intc_suspend(void *data) { struct intc_desc_int *d; @@ -420,7 +420,7 @@ static int intc_suspend(void) return 0; } -static void intc_resume(void) +static void intc_resume(void *data) { struct intc_desc_int *d; @@ -450,11 +450,15 @@ static void intc_resume(void) } } -struct syscore_ops intc_syscore_ops = { +static const struct syscore_ops intc_syscore_ops = { .suspend = intc_suspend, .resume = intc_resume, }; +static struct syscore intc_syscore = { + .ops = &intc_syscore_ops, +}; + const struct bus_type intc_subsys = { .name = "intc", .dev_name = "intc", @@ -477,7 +481,7 @@ static int __init register_intc_devs(void) struct intc_desc_int *d; int error; - register_syscore_ops(&intc_syscore_ops); + register_syscore(&intc_syscore); error = subsys_system_register(&intc_subsys, NULL); if (!error) { diff --git a/drivers/soc/bcm/brcmstb/biuctrl.c b/drivers/soc/bcm/brcmstb/biuctrl.c index 364ddbe365c2..bd830649b60d 100644 --- a/drivers/soc/bcm/brcmstb/biuctrl.c +++ b/drivers/soc/bcm/brcmstb/biuctrl.c @@ -298,7 +298,7 @@ out: #ifdef CONFIG_PM_SLEEP static u32 cpubiuctrl_reg_save[NUM_CPU_BIUCTRL_REGS]; -static int brcmstb_cpu_credit_reg_suspend(void) +static int brcmstb_cpu_credit_reg_suspend(void *data) { unsigned int i; @@ -311,7 +311,7 @@ static int brcmstb_cpu_credit_reg_suspend(void) return 0; } -static void brcmstb_cpu_credit_reg_resume(void) +static void brcmstb_cpu_credit_reg_resume(void *data) { unsigned int i; @@ -322,10 +322,14 @@ static void brcmstb_cpu_credit_reg_resume(void) cbc_writel(cpubiuctrl_reg_save[i], i); } -static struct syscore_ops brcmstb_cpu_credit_syscore_ops = { +static const struct syscore_ops brcmstb_cpu_credit_syscore_ops = { .suspend = brcmstb_cpu_credit_reg_suspend, .resume = brcmstb_cpu_credit_reg_resume, }; + +static struct syscore brcmstb_cpu_credit_syscore = { + .ops = &brcmstb_cpu_credit_syscore_ops, +}; #endif @@ -354,7 +358,7 @@ static int __init brcmstb_biuctrl_init(void) a72_b53_rac_enable_all(np); mcp_a72_b53_set(); #ifdef CONFIG_PM_SLEEP - register_syscore_ops(&brcmstb_cpu_credit_syscore_ops); + register_syscore(&brcmstb_cpu_credit_syscore); #endif ret = 0; out_put: diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c index 034a2a535a1e..93bbebd68001 100644 --- a/drivers/soc/tegra/pmc.c +++ b/drivers/soc/tegra/pmc.c @@ -466,7 +466,7 @@ struct tegra_pmc { unsigned long *wake_type_dual_edge_map; unsigned long *wake_sw_status_map; unsigned long *wake_cntrl_level_map; - struct syscore_ops syscore; + struct syscore syscore; }; static struct tegra_pmc *pmc = &(struct tegra_pmc) { @@ -3147,7 +3147,7 @@ static void tegra186_pmc_process_wake_events(struct tegra_pmc *pmc, unsigned int } } -static void tegra186_pmc_wake_syscore_resume(void) +static void tegra186_pmc_wake_syscore_resume(void *data) { u32 status, mask; unsigned int i; @@ -3160,7 +3160,7 @@ static void tegra186_pmc_wake_syscore_resume(void) } } -static int tegra186_pmc_wake_syscore_suspend(void) +static int tegra186_pmc_wake_syscore_suspend(void *data) { wke_read_sw_wake_status(pmc); @@ -3179,6 +3179,11 @@ static int tegra186_pmc_wake_syscore_suspend(void) return 0; } +static const struct syscore_ops tegra186_pmc_wake_syscore_ops = { + .suspend = tegra186_pmc_wake_syscore_suspend, + .resume = tegra186_pmc_wake_syscore_resume, +}; + #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_ARM) static int tegra_pmc_suspend(struct device *dev) { @@ -3829,10 +3834,8 @@ static const struct tegra_pmc_regs tegra186_pmc_regs = { static void tegra186_pmc_init(struct tegra_pmc *pmc) { - pmc->syscore.suspend = tegra186_pmc_wake_syscore_suspend; - pmc->syscore.resume = tegra186_pmc_wake_syscore_resume; - - register_syscore_ops(&pmc->syscore); + pmc->syscore.ops = &tegra186_pmc_wake_syscore_ops; + register_syscore(&pmc->syscore); } static void tegra186_pmc_setup_irq_polarity(struct tegra_pmc *pmc, diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c index bd2fca7dc017..8a2f441cd2ec 100644 --- a/drivers/thermal/intel/intel_hfi.c +++ b/drivers/thermal/intel/intel_hfi.c @@ -592,7 +592,7 @@ static void hfi_disable_instance(void *ptr) hfi_disable(); } -static void hfi_syscore_resume(void) +static void hfi_syscore_resume(void *data) { /* This code runs only on the boot CPU. */ struct hfi_cpu_info *info = &per_cpu(hfi_cpu_info, 0); @@ -603,7 +603,7 @@ static void hfi_syscore_resume(void) hfi_enable_instance(hfi_instance); } -static int hfi_syscore_suspend(void) +static int hfi_syscore_suspend(void *data) { /* No locking needed. There is no concurrency with CPU offline. */ hfi_disable(); @@ -611,11 +611,15 @@ static int hfi_syscore_suspend(void) return 0; } -static struct syscore_ops hfi_pm_ops = { +static const struct syscore_ops hfi_pm_ops = { .resume = hfi_syscore_resume, .suspend = hfi_syscore_suspend, }; +static struct syscore hfi_pm = { + .ops = &hfi_pm_ops, +}; + static int hfi_thermal_notify(struct notifier_block *nb, unsigned long state, void *_notify) { @@ -710,7 +714,7 @@ void __init intel_hfi_init(void) if (thermal_genl_register_notifier(&hfi_thermal_nb)) goto err_nl_notif; - register_syscore_ops(&hfi_pm_ops); + register_syscore(&hfi_pm); return; diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 296703939846..f2e8eaf684ba 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -495,7 +495,7 @@ static void xen_acpi_processor_resume_worker(struct work_struct *dummy) pr_info("ACPI data upload failed, error = %d\n", rc); } -static void xen_acpi_processor_resume(void) +static void xen_acpi_processor_resume(void *data) { static DECLARE_WORK(wq, xen_acpi_processor_resume_worker); @@ -509,10 +509,14 @@ static void xen_acpi_processor_resume(void) schedule_work(&wq); } -static struct syscore_ops xap_syscore_ops = { +static const struct syscore_ops xap_syscore_ops = { .resume = xen_acpi_processor_resume, }; +static struct syscore xap_syscore = { + .ops = &xap_syscore_ops, +}; + static int __init xen_acpi_processor_init(void) { int i; @@ -563,7 +567,7 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - register_syscore_ops(&xap_syscore_ops); + register_syscore(&xap_syscore); return 0; err_unregister: @@ -580,7 +584,7 @@ static void __exit xen_acpi_processor_exit(void) { int i; - unregister_syscore_ops(&xap_syscore_ops); + unregister_syscore(&xap_syscore); bitmap_free(acpi_ids_done); bitmap_free(acpi_id_present); bitmap_free(acpi_id_cst_present); diff --git a/include/linux/syscore_ops.h b/include/linux/syscore_ops.h index ae4d48e4c970..ac6d71be5c38 100644 --- a/include/linux/syscore_ops.h +++ b/include/linux/syscore_ops.h @@ -11,14 +11,19 @@ #include struct syscore_ops { + int (*suspend)(void *data); + void (*resume)(void *data); + void (*shutdown)(void *data); +}; + +struct syscore { struct list_head node; - int (*suspend)(void); - void (*resume)(void); - void (*shutdown)(void); + const struct syscore_ops *ops; + void *data; }; -extern void register_syscore_ops(struct syscore_ops *ops); -extern void unregister_syscore_ops(struct syscore_ops *ops); +extern void register_syscore(struct syscore *syscore); +extern void unregister_syscore(struct syscore *syscore); #ifdef CONFIG_PM_SLEEP extern int syscore_suspend(void); extern void syscore_resume(void); diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index b0f0d15085db..7481fbb947d3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -173,7 +173,7 @@ int cpu_cluster_pm_exit(void) EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); #ifdef CONFIG_PM -static int cpu_pm_suspend(void) +static int cpu_pm_suspend(void *data) { int ret; @@ -185,20 +185,24 @@ static int cpu_pm_suspend(void) return ret; } -static void cpu_pm_resume(void) +static void cpu_pm_resume(void *data) { cpu_cluster_pm_exit(); cpu_pm_exit(); } -static struct syscore_ops cpu_pm_syscore_ops = { +static const struct syscore_ops cpu_pm_syscore_ops = { .suspend = cpu_pm_suspend, .resume = cpu_pm_resume, }; +static struct syscore cpu_pm_syscore = { + .ops = &cpu_pm_syscore_ops, +}; + static int cpu_pm_init(void) { - register_syscore_ops(&cpu_pm_syscore_ops); + register_syscore(&cpu_pm_syscore); return 0; } core_initcall(cpu_pm_init); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index bf59e37d650a..3cd0c40282c0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -650,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc) } #ifdef CONFIG_PM -static int irq_gc_suspend(void) +static int irq_gc_suspend(void *data) { struct irq_chip_generic *gc; @@ -670,7 +670,7 @@ static int irq_gc_suspend(void) return 0; } -static void irq_gc_resume(void) +static void irq_gc_resume(void *data) { struct irq_chip_generic *gc; @@ -693,7 +693,7 @@ static void irq_gc_resume(void) #define irq_gc_resume NULL #endif -static void irq_gc_shutdown(void) +static void irq_gc_shutdown(void *data) { struct irq_chip_generic *gc; @@ -709,15 +709,19 @@ static void irq_gc_shutdown(void) } } -static struct syscore_ops irq_gc_syscore_ops = { +static const struct syscore_ops irq_gc_syscore_ops = { .suspend = irq_gc_suspend, .resume = irq_gc_resume, .shutdown = irq_gc_shutdown, }; +static struct syscore irq_gc_syscore = { + .ops = &irq_gc_syscore_ops, +}; + static int __init irq_gc_init_ops(void) { - register_syscore_ops(&irq_gc_syscore_ops); + register_syscore(&irq_gc_syscore); return 0; } device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f7394729cedc..99ff65466d87 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -211,21 +211,26 @@ void rearm_wake_irq(unsigned int irq) /** * irq_pm_syscore_resume - enable interrupt lines early + * @data: syscore context * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ -static void irq_pm_syscore_resume(void) +static void irq_pm_syscore_resume(void *data) { resume_irqs(true); } -static struct syscore_ops irq_pm_syscore_ops = { +static const struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; +static struct syscore irq_pm_syscore = { + .ops = &irq_pm_syscore_ops, +}; + static int __init irq_pm_init_ops(void) { - register_syscore_ops(&irq_pm_syscore_ops); + register_syscore(&irq_pm_syscore); return 0; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5aee9ffb16b9..f852dce94015 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3639,12 +3639,13 @@ static bool legacy_kthread_create(void) /** * printk_kthreads_shutdown - shutdown all threaded printers + * @data: syscore context * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ -static void printk_kthreads_shutdown(void) +static void printk_kthreads_shutdown(void *data) { struct console *con; @@ -3666,10 +3667,14 @@ static void printk_kthreads_shutdown(void) console_list_unlock(); } -static struct syscore_ops printk_syscore_ops = { +static const struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; +static struct syscore printk_syscore = { + .ops = &printk_syscore_ops, +}; + /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. @@ -3737,7 +3742,7 @@ static void printk_kthreads_check_locked(void) static int __init printk_set_kthreads_ready(void) { - register_syscore_ops(&printk_syscore_ops); + register_syscore(&printk_syscore); console_list_lock(); printk_kthreads_ready = true; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cc1afec306b3..f39111830ca3 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -296,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -305,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b6974fce800c..f3513679ee09 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1994,6 +1994,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -2061,15 +2066,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 226faeaa8e56..b7675a58d663 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5629,7 +5629,7 @@ static int kvm_offline_cpu(unsigned int cpu) return 0; } -static void kvm_shutdown(void) +static void kvm_shutdown(void *data) { /* * Disable hardware virtualization and set kvm_rebooting to indicate @@ -5647,7 +5647,7 @@ static void kvm_shutdown(void) on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } -static int kvm_suspend(void) +static int kvm_suspend(void *data) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume @@ -5664,7 +5664,7 @@ static int kvm_suspend(void) return 0; } -static void kvm_resume(void) +static void kvm_resume(void *data) { lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); @@ -5672,12 +5672,16 @@ static void kvm_resume(void) WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } -static struct syscore_ops kvm_syscore_ops = { +static const struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, .shutdown = kvm_shutdown, }; +static struct syscore kvm_syscore = { + .ops = &kvm_syscore_ops, +}; + int kvm_enable_virtualization(void) { int r; @@ -5694,7 +5698,7 @@ int kvm_enable_virtualization(void) if (r) goto err_cpuhp; - register_syscore_ops(&kvm_syscore_ops); + register_syscore(&kvm_syscore); /* * Undo virtualization enabling and bail if the system is going down. @@ -5716,7 +5720,7 @@ int kvm_enable_virtualization(void) return 0; err_rebooting: - unregister_syscore_ops(&kvm_syscore_ops); + unregister_syscore(&kvm_syscore); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); err_cpuhp: kvm_arch_disable_virtualization(); @@ -5732,7 +5736,7 @@ void kvm_disable_virtualization(void) if (--kvm_usage_count) return; - unregister_syscore_ops(&kvm_syscore_ops); + unregister_syscore(&kvm_syscore); cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -- cgit v1.2.3 From c42ba5a87bdccbca11403b7ca8bad1a57b833732 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 10 Nov 2025 10:38:52 +0100 Subject: futex: Store time as ktime_t in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The futex core uses ktime_t to represent times, use that also for the restart block. This allows the simplification of the accessors. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Jan Kara Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-2-5d39cc93df4f@linutronix.de --- include/linux/restart_block.h | 2 +- kernel/futex/waitwake.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..4f9316e7590d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -32,7 +32,7 @@ struct restart_block { u32 val; u32 flags; u32 bitset; - u64 time; + ktime_t time; u32 __user *uaddr2; } futex; /* For nanosleep */ diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec2..1c2dd03f11ec 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -738,12 +738,11 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - ktime_t t, *tp = NULL; + ktime_t *tp = NULL; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) + tp = &restart->futex.time; - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t = restart->futex.time; - tp = &t; - } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, -- cgit v1.2.3 From ce62118a2e4838bcef1050fff55001a0bf87f0cb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:49 -0500 Subject: KVM: SEV: Consolidate the SEV policy bits in a single header file Consolidate SEV policy bit definitions into a single file. Use include/linux/psp-sev.h to hold the definitions and remove the current definitions from the arch/x86/kvm/svm/sev.c and arch/x86/include/svm.h files. No functional change intended. Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/d9639f88a0b521a1a67aeac77cc609fdea1f90bd.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 16 ++++------------ arch/x86/kvm/svm/svm.h | 3 --- include/linux/psp-sev.h | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0835c664fbfd..f04589ae76bb 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -65,15 +65,7 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 -/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ -#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) -#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) -#define SNP_POLICY_MASK_SMT BIT_ULL(16) -#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) -#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) -#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) - -#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ +#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ SNP_POLICY_MASK_API_MAJOR | \ SNP_POLICY_MASK_SMT | \ SNP_POLICY_MASK_RSVD_MBO | \ @@ -2207,7 +2199,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~SNP_POLICY_MASK_VALID) + if (params.policy & ~KVM_SNP_POLICY_MASK_VALID) return -EINVAL; /* Check for policy bits that must be set */ @@ -5085,10 +5077,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) /* Check if the SEV policy allows debugging */ if (sev_snp_guest(vcpu->kvm)) { - if (!(sev->policy & SNP_POLICY_DEBUG)) + if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) return NULL; } else { - if (sev->policy & SEV_POLICY_NODBG) + if (sev->policy & SEV_POLICY_MASK_NODBG) return NULL; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6765a5e433ce..a9f6c1ece63d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -117,9 +117,6 @@ struct kvm_sev_info { cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; -#define SEV_POLICY_NODBG BIT_ULL(0) -#define SNP_POLICY_DEBUG BIT_ULL(19) - struct kvm_svm { struct kvm kvm; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..27c92543bf38 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,25 @@ #include +/* As defined by SEV API, under "Guest Policy". */ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** -- cgit v1.2.3 From c9434e64e8b4d17511f514f7495008f573595e3e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:50 -0500 Subject: crypto: ccp - Add an API to return the supported SEV-SNP policy bits Supported policy bits are dependent on the level of SEV firmware that is currently running. Create an API to return the supported policy bits for the current level of firmware. Signed-off-by: Tom Lendacky Acked-by: Herbert Xu Link: https://patch.msgid.link/e3f711366ddc22e3dd215c987fd2e28dc1c07f54.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- drivers/crypto/ccp/sev-dev.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/psp-sev.h | 18 ++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 0d13d47c164b..db7c7c50cebc 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void) } EXPORT_SYMBOL_GPL(sev_platform_shutdown); +u64 sev_get_snp_policy_bits(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + u64 policy_bits; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + + policy_bits = SNP_POLICY_MASK_BASE; + + if (sev->snp_plat_status.feature_info) { + if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_RAPL_DIS; + + if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM; + + if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS; + + if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CXL_ALLOW; + + if (sev_version_greater_or_equal(1, 58)) + policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE; + } + + return policy_bits; +} +EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 27c92543bf38..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -32,6 +32,20 @@ #define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) #define SNP_POLICY_MASK_DEBUG BIT_ULL(19) #define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ @@ -868,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -1014,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ -- cgit v1.2.3 From 337b1b566db087347194e4543ddfdfa5645275cc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 18:26:23 +0200 Subject: PCI: Fix restoring BARs on BAR resize rollback path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BAR resize operation is implemented in the pci_resize_resource() and pbus_reassign_bridge_resources() functions. pci_resize_resource() can be called either from __resource_resize_store() from sysfs or directly by the driver for the Endpoint Device. The pci_resize_resource() requires that caller has released the device resources that share the bridge window with the BAR to be resized as otherwise the bridge window is pinned in place and cannot be changed. pbus_reassign_bridge_resources() rolls back resources if the resize operation fails, but rollback is performed only for the bridge windows. Because releasing the device resources are done by the caller of the BAR resize interface, these functions performing the BAR resize do not have access to the device resources as they were before the resize. pbus_reassign_bridge_resources() could try __pci_bridge_assign_resources() after rolling back the bridge windows as they were, however, it will not guarantee the resource are assigned due to differences in how FW and the kernel assign the resources (alignment of the start address and tail). To perform rollback robustly, the BAR resize interface has to be altered to also release the device resources that share the bridge window with the BAR to be resized. Also, remove restoring from the entries failed list as saved list should now contain both the bridge windows and device resources so the extra restore is duplicated work. Some drivers (currently only amdgpu) want to prevent releasing some resources. Add exclude_bars param to pci_resize_resource() and make amdgpu pass its register BAR (BAR 2 or 5), which should never be released during resize operation. Normally 64-bit prefetchable resources do not share a bridge window with the 32-bit only register BAR, but there are various fallbacks in the resource assignment logic which may make the resources share the bridge window in rare cases. This change (together with the driver side changes) is to counter the resource releases that had to be done to prevent resource tree corruption in the ("PCI: Release assigned resource before restoring them") change. As such, it likely restores functionality in cases where device resources were released to avoid resource tree conflicts which appeared to be "working" when such conflicts were not correctly detected by the kernel. Reported-by: Simon Richter Link: https://lore.kernel.org/linux-pci/f9a8c975-f5d3-4dd2-988e-4371a1433a60@hogyros.de/ Reported-by: Alex Bennée Link: https://lore.kernel.org/linux-pci/874irqop6b.fsf@draig.linaro.org/ Signed-off-by: Ilpo Järvinen [bhelgaas: squash amdgpu BAR selection from https://lore.kernel.org/r/20251114103053.13778-1-ilpo.jarvinen@linux.intel.com] Signed-off-by: Bjorn Helgaas Tested-by: Alex Bennée # AVA, AMD GPU Reviewed-by: Christian König Link: https://patch.msgid.link/20251113162628.5946-7-ilpo.jarvinen@linux.intel.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +- drivers/gpu/drm/i915/gt/intel_region_lmem.c | 2 +- drivers/gpu/drm/xe/xe_vram.c | 2 +- drivers/pci/pci-sysfs.c | 17 +---- drivers/pci/pci.h | 4 +- drivers/pci/setup-bus.c | 100 +++++++++++++++++++++------- drivers/pci/setup-res.c | 23 ++----- include/linux/pci.h | 3 +- 8 files changed, 93 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7a899fb4de29..bf0bc38e1c47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1736,7 +1736,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) pci_release_resource(adev->pdev, 0); - r = pci_resize_resource(adev->pdev, 0, rbar_size); + r = pci_resize_resource(adev->pdev, 0, rbar_size, + (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5 + : 1 << 2); if (r == -ENOSPC) dev_info(adev->dev, "Not enough PCI address space for a large BAR."); diff --git a/drivers/gpu/drm/i915/gt/intel_region_lmem.c b/drivers/gpu/drm/i915/gt/intel_region_lmem.c index 51bb27e10a4f..7699e8fcf5ed 100644 --- a/drivers/gpu/drm/i915/gt/intel_region_lmem.c +++ b/drivers/gpu/drm/i915/gt/intel_region_lmem.c @@ -37,7 +37,7 @@ _resize_bar(struct drm_i915_private *i915, int resno, resource_size_t size) _release_bars(pdev); - ret = pci_resize_resource(pdev, resno, bar_size); + ret = pci_resize_resource(pdev, resno, bar_size, 0); if (ret) { drm_info(&i915->drm, "Failed to resize BAR%d to %dM (%pe)\n", resno, 1 << bar_size, ERR_PTR(ret)); diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index b44ebf50fedb..00dd027057df 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -36,7 +36,7 @@ _resize_bar(struct xe_device *xe, int resno, resource_size_t size) if (pci_resource_len(pdev, resno)) pci_release_resource(pdev, resno); - ret = pci_resize_resource(pdev, resno, bar_size); + ret = pci_resize_resource(pdev, resno, bar_size, 0); if (ret) { drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). Consider enabling 'Resizable BAR' support in your BIOS\n", resno, 1 << bar_size, ERR_PTR(ret)); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9d6f74bd95f8..2a1b5456c2dc 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1599,18 +1599,13 @@ static ssize_t __resource_resize_store(struct device *dev, int n, { struct pci_dev *pdev = to_pci_dev(dev); struct pci_bus *bus = pdev->bus; - struct resource *b_win, *res; unsigned long size; - int ret, i; + int ret; u16 cmd; if (kstrtoul(buf, 0, &size) < 0) return -EINVAL; - b_win = pbus_select_window(bus, pci_resource_n(pdev, n)); - if (!b_win) - return -EINVAL; - device_lock(dev); if (dev->driver || pci_num_vf(pdev)) { ret = -EBUSY; @@ -1632,15 +1627,7 @@ static ssize_t __resource_resize_store(struct device *dev, int n, pci_remove_resource_files(pdev); - pci_dev_for_each_resource(pdev, res, i) { - if (i >= PCI_BRIDGE_RESOURCES) - break; - - if (b_win == pbus_select_window(bus, res)) - pci_release_resource(pdev, i); - } - - ret = pci_resize_resource(pdev, n, size); + ret = pci_resize_resource(pdev, n, size, 0); pci_assign_unassigned_bus_resources(bus); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index bf1a577e9623..9893ea12d1f2 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -421,8 +421,10 @@ enum pci_bar_type { struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); +void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size); +int pci_do_resource_release_and_resize(struct pci_dev *dev, int resno, int size, + int exclude_bars); unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); -int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 51f5e5a80b54..7e268960954b 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2420,18 +2420,16 @@ EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources); * release it when possible. If the bridge window contains assigned * resources, it cannot be released. */ -int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) +static int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res, + struct list_head *saved) { unsigned long type = res->flags; struct pci_dev_resource *dev_res; struct pci_dev *bridge = NULL; - LIST_HEAD(saved); LIST_HEAD(added); LIST_HEAD(failed); unsigned int i; - int ret; - - down_read(&pci_bus_sem); + int ret = 0; while (!pci_is_root_bus(bus)) { bridge = bus->self; @@ -2443,9 +2441,9 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) /* Ignore BARs which are still in use */ if (!res->child) { - ret = add_to_list(&saved, bridge, res, 0, 0); + ret = add_to_list(saved, bridge, res, 0, 0); if (ret) - goto cleanup; + return ret; pci_release_resource(bridge, i); } else { @@ -2468,34 +2466,78 @@ int pbus_reassign_bridge_resources(struct pci_bus *bus, struct resource *res) free_list(&added); if (!list_empty(&failed)) { - if (pci_required_resource_failed(&failed, type)) { + if (pci_required_resource_failed(&failed, type)) ret = -ENOSPC; - goto cleanup; - } - /* Only resources with unrelated types failed (again) */ free_list(&failed); + if (ret) + return ret; + + /* Only resources with unrelated types failed (again) */ } - list_for_each_entry(dev_res, &saved, list) { + list_for_each_entry(dev_res, saved, list) { struct pci_dev *dev = dev_res->dev; /* Skip the bridge we just assigned resources for */ if (bridge == dev) continue; + if (!dev->subordinate) + continue; + pci_setup_bridge(dev->subordinate); } - free_list(&saved); - up_read(&pci_bus_sem); return 0; +} -cleanup: - /* Restore size and flags */ - list_for_each_entry(dev_res, &failed, list) - restore_dev_resource(dev_res); - free_list(&failed); +int pci_do_resource_release_and_resize(struct pci_dev *pdev, int resno, int size, + int exclude_bars) +{ + struct resource *res = pci_resource_n(pdev, resno); + struct pci_dev_resource *dev_res; + struct pci_bus *bus = pdev->bus; + struct resource *b_win, *r; + LIST_HEAD(saved); + unsigned int i; + int ret = 0; + + b_win = pbus_select_window(bus, res); + if (!b_win) + return -EINVAL; + + pci_dev_for_each_resource(pdev, r, i) { + if (i >= PCI_BRIDGE_RESOURCES) + break; + + if (exclude_bars & BIT(i)) + continue; + if (b_win != pbus_select_window(bus, r)) + continue; + + ret = add_to_list(&saved, pdev, r, 0, 0); + if (ret) + goto restore; + pci_release_resource(pdev, i); + } + + pci_resize_resource_set_size(pdev, resno, size); + + if (!bus->self) + goto out; + + down_read(&pci_bus_sem); + ret = pbus_reassign_bridge_resources(bus, res, &saved); + if (ret) + goto restore; + +out: + up_read(&pci_bus_sem); + free_list(&saved); + return ret; + +restore: /* Revert to the old configuration */ list_for_each_entry(dev_res, &saved, list) { struct resource *res = dev_res->res; @@ -2510,13 +2552,21 @@ cleanup: restore_dev_resource(dev_res); - pci_claim_resource(dev, i); - pci_setup_bridge(dev->subordinate); - } - up_read(&pci_bus_sem); - free_list(&saved); + ret = pci_claim_resource(dev, i); + if (ret) + continue; - return ret; + if (i < PCI_BRIDGE_RESOURCES) { + const char *res_name = pci_resource_name(dev, i); + + pci_update_resource(dev, i); + pci_info(dev, "%s %pR: old value restored\n", + res_name, res); + } + if (dev->subordinate) + pci_setup_bridge(dev->subordinate); + } + goto out; } void pci_assign_unassigned_bus_resources(struct pci_bus *bus) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 3d0b0b3f60c4..e4486d7030c0 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -444,8 +444,7 @@ static bool pci_resize_is_memory_decoding_enabled(struct pci_dev *dev, return cmd & PCI_COMMAND_MEMORY; } -static void pci_resize_resource_set_size(struct pci_dev *dev, int resno, - int size) +void pci_resize_resource_set_size(struct pci_dev *dev, int resno, int size) { resource_size_t res_size = pci_rebar_size_to_bytes(size); struct resource *res = pci_resource_n(dev, resno); @@ -456,9 +455,9 @@ static void pci_resize_resource_set_size(struct pci_dev *dev, int resno, resource_set_size(res, res_size); } -int pci_resize_resource(struct pci_dev *dev, int resno, int size) +int pci_resize_resource(struct pci_dev *dev, int resno, int size, + int exclude_bars) { - struct resource *res = pci_resource_n(dev, resno); struct pci_host_bridge *host; int old, ret; u32 sizes; @@ -468,10 +467,6 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size) if (host->preserve_config) return -ENOTSUPP; - /* Make sure the resource isn't assigned before resizing it. */ - if (!(res->flags & IORESOURCE_UNSET)) - return -EBUSY; - if (pci_resize_is_memory_decoding_enabled(dev, resno)) return -EBUSY; @@ -490,19 +485,13 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size) if (ret) return ret; - pci_resize_resource_set_size(dev, resno, size); - - /* Check if the new config works by trying to assign everything. */ - if (dev->bus->self) { - ret = pbus_reassign_bridge_resources(dev->bus, res); - if (ret) - goto error_resize; - } + ret = pci_do_resource_release_and_resize(dev, resno, size, exclude_bars); + if (ret) + goto error_resize; return 0; error_resize: pci_rebar_set_size(dev, resno, old); - pci_resize_resource_set_size(dev, resno, old); return ret; } EXPORT_SYMBOL(pci_resize_resource); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..34ff295cd2e3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1428,7 +1428,8 @@ static inline int pci_rebar_bytes_to_size(u64 bytes) } u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); -int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size); +int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, + int exclude_bars); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); -- cgit v1.2.3 From 876e15943e9205096441cbe520dc9ccf82df8344 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:44 +0200 Subject: PCI: Move pci_rebar_bytes_to_size() and clean it up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move pci_rebar_bytes_to_size() from include/linux/pci.h to rebar.c as it does not look very trivial and is not expected to be performance critical. Convert literals to use a newly added PCI_REBAR_MIN_SIZE define. Also add kernel doc for the function as the function is exported. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20251113180053.27944-3-ilpo.jarvinen@linux.intel.com --- drivers/pci/rebar.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 10 +++------- 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index f6ed7e4893a7..f29810fe0a58 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -7,11 +7,34 @@ #include #include #include +#include #include +#include #include #include "pci.h" +#define PCI_REBAR_MIN_SIZE ((resource_size_t)SZ_1M) + +/** + * pci_rebar_bytes_to_size - Convert size in bytes to PCI BAR Size + * @bytes: size in bytes + * + * Convert size in bytes to encoded BAR Size in Resizable BAR Capability + * (PCIe r6.2, sec. 7.8.6.3). + * + * Return: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB) + */ +int pci_rebar_bytes_to_size(u64 bytes) +{ + int rebar_minsize = ilog2(PCI_REBAR_MIN_SIZE); + + bytes = roundup_pow_of_two(bytes); + + return max(ilog2(bytes), rebar_minsize) - rebar_minsize; +} +EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size); + void pci_rebar_init(struct pci_dev *pdev) { pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); diff --git a/include/linux/pci.h b/include/linux/pci.h index 34ff295cd2e3..628dda63b9e0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1419,17 +1419,13 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_release_resource(struct pci_dev *dev, int resno); -static inline int pci_rebar_bytes_to_size(u64 bytes) -{ - bytes = roundup_pow_of_two(bytes); - - /* Return BAR size as defined in the resizable BAR specification */ - return max(ilog2(bytes), 20) - 20; -} +/* Resizable BAR related routines */ +int pci_rebar_bytes_to_size(u64 bytes); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); + int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); -- cgit v1.2.3 From a337869885083131e575c6367c679f4da4b68bb0 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:45 +0200 Subject: PCI: Move pci_rebar_size_to_bytes() and export it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_rebar_size_to_bytes() is in drivers/pci/pci.h but would be useful for endpoint drivers as well. Move the function to rebar.c and export it. In addition, convert the literal to where the number comes from (PCI_REBAR_MIN_SIZE). Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-4-ilpo.jarvinen@linux.intel.com --- drivers/pci/pci.h | 4 ---- drivers/pci/rebar.c | 12 ++++++++++++ include/linux/pci.h | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 41df35920632..a1e7dbeb0f2c 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -1024,10 +1024,6 @@ void pci_rebar_init(struct pci_dev *pdev); void pci_restore_rebar_state(struct pci_dev *pdev); int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); -static inline u64 pci_rebar_size_to_bytes(int size) -{ - return 1ULL << (size + 20); -} struct device_node; diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index f29810fe0a58..a81cb3fcddef 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -35,6 +35,18 @@ int pci_rebar_bytes_to_size(u64 bytes) } EXPORT_SYMBOL_GPL(pci_rebar_bytes_to_size); +/** + * pci_rebar_size_to_bytes - Convert encoded BAR Size to size in bytes + * @size: encoded BAR Size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Return: BAR size in bytes + */ +resource_size_t pci_rebar_size_to_bytes(int size) +{ + return 1ULL << (size + ilog2(PCI_REBAR_MIN_SIZE)); +} +EXPORT_SYMBOL_GPL(pci_rebar_size_to_bytes); + void pci_rebar_init(struct pci_dev *pdev) { pdev->rebar_cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_REBAR); diff --git a/include/linux/pci.h b/include/linux/pci.h index 628dda63b9e0..33b27e0c4f3e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1422,6 +1422,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); /* Resizable BAR related routines */ int pci_rebar_bytes_to_size(u64 bytes); +resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From bb1fabd0d94efc29f88f86fb996c40ac06db3669 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:47 +0200 Subject: PCI: Add pci_rebar_size_supported() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many callers of pci_rebar_get_possible_sizes() are interested in finding out if a particular encoded BAR Size (PCIe r7.0, sec 7.8.6.3) is supported by the particular BAR. Add pci_rebar_size_supported() into PCI core to make it easy for the drivers to determine if the BAR size is supported or not. Use the new function in pci_resize_resource() and in pci_iov_vf_bar_set_size(). Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Andi Shyti Link: https://patch.msgid.link/20251113180053.27944-6-ilpo.jarvinen@linux.intel.com --- drivers/pci/iov.c | 8 +------- drivers/pci/rebar.c | 25 +++++++++++++++++++------ include/linux/pci.h | 1 + 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 04b675e90963..71ed85d38508 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -1339,19 +1339,13 @@ EXPORT_SYMBOL_GPL(pci_sriov_configure_simple); */ int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size) { - u32 sizes; - if (!pci_resource_is_iov(resno)) return -EINVAL; if (pci_iov_is_memory_decoding_enabled(dev)) return -EBUSY; - sizes = pci_rebar_get_possible_sizes(dev, resno); - if (!sizes) - return -ENOTSUPP; - - if (!(sizes & BIT(size))) + if (!pci_rebar_size_supported(dev, resno, size)) return -EINVAL; return pci_rebar_set_size(dev, resno, size); diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index 399660cb12fa..1c90b606b8d4 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -3,6 +3,7 @@ * PCI Resizable BAR Extended Capability handling. */ +#include #include #include #include @@ -124,6 +125,23 @@ u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) } EXPORT_SYMBOL(pci_rebar_get_possible_sizes); +/** + * pci_rebar_size_supported - check if size is supported for BAR + * @pdev: PCI device + * @bar: BAR to check + * @size: encoded size as defined in the PCIe spec (0=1MB, 31=128TB) + * + * Return: %true if @bar is resizable and @size is supported, otherwise + * %false. + */ +bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size) +{ + u64 sizes = pci_rebar_get_possible_sizes(pdev, bar); + + return BIT(size) & sizes; +} +EXPORT_SYMBOL_GPL(pci_rebar_size_supported); + /** * pci_rebar_get_current_size - get the current size of a Resizable BAR * @pdev: PCI device @@ -252,7 +270,6 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size, { struct pci_host_bridge *host; int old, ret; - u32 sizes; /* Check if we must preserve the firmware's resource assignment */ host = pci_find_host_bridge(dev->bus); @@ -262,11 +279,7 @@ int pci_resize_resource(struct pci_dev *dev, int resno, int size, if (pci_resize_is_memory_decoding_enabled(dev, resno)) return -EBUSY; - sizes = pci_rebar_get_possible_sizes(dev, resno); - if (!sizes) - return -ENOTSUPP; - - if (!(sizes & BIT(size))) + if (!pci_rebar_size_supported(dev, resno, size)) return -EINVAL; old = pci_rebar_get_current_size(dev, resno); diff --git a/include/linux/pci.h b/include/linux/pci.h index 33b27e0c4f3e..0ef827cfaf0c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1424,6 +1424,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); +bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From 1c680f2acdbb3b64965962ca060a6daa6379575d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:50 +0200 Subject: PCI: Add pci_rebar_get_max_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pci_rebar_get_max_size() to allow simplifying code that wants to know the maximum possible size for a Resizable BAR. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-9-ilpo.jarvinen@linux.intel.com --- drivers/pci/rebar.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index 1c90b606b8d4..e99b89bd5e51 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -142,6 +143,28 @@ bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size) } EXPORT_SYMBOL_GPL(pci_rebar_size_supported); +/** + * pci_rebar_get_max_size - get the maximum supported size of a BAR + * @pdev: PCI device + * @bar: BAR to query + * + * Get the largest supported size of a resizable BAR as a size. + * + * Return: the encoded maximum BAR size as defined in the PCIe spec + * (0=1MB, 31=128TB), or %-NOENT on error. + */ +int pci_rebar_get_max_size(struct pci_dev *pdev, int bar) +{ + u32 sizes; + + sizes = pci_rebar_get_possible_sizes(pdev, bar); + if (!sizes) + return -ENOENT; + + return __fls(sizes); +} +EXPORT_SYMBOL_GPL(pci_rebar_get_max_size); + /** * pci_rebar_get_current_size - get the current size of a Resizable BAR * @pdev: PCI device diff --git a/include/linux/pci.h b/include/linux/pci.h index 0ef827cfaf0c..898bc3a4e8e7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1425,6 +1425,7 @@ int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); +int pci_rebar_get_max_size(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From bf0a90fc907e47344f88e5b9b241082184dbac27 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 13 Nov 2025 20:00:53 +0200 Subject: PCI: Convert BAR sizes bitmasks to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r7.0, sec 7.8.6, defines resizable BAR sizes beyond the currently supported maximum of 128TB, which will require more than u32 to store the entire bitmask. Convert Resizable BAR related functions to use u64 bitmask for BAR sizes to make the typing more future-proof. The support for the larger BAR sizes themselves is not added at this point. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-12-ilpo.jarvinen@linux.intel.com --- drivers/gpu/drm/xe/xe_vram.c | 2 +- drivers/pci/iov.c | 2 +- drivers/pci/pci-sysfs.c | 2 +- drivers/pci/rebar.c | 4 ++-- include/linux/pci.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c index 524469f8a4bd..10f8a73e190b 100644 --- a/drivers/gpu/drm/xe/xe_vram.c +++ b/drivers/gpu/drm/xe/xe_vram.c @@ -69,7 +69,7 @@ static void resize_vram_bar(struct xe_device *xe) if (!pci_rebar_size_supported(pdev, LMEM_BAR, rebar_size)) { drm_info(&xe->drm, - "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n", + "Requested size: %lluMiB is not supported by rebar sizes: 0x%llx. Leaving default: %lluMiB\n", (u64)pci_rebar_size_to_bytes(rebar_size) >> 20, pci_rebar_get_possible_sizes(pdev, LMEM_BAR), (u64)current_size >> 20); diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 71ed85d38508..00784a60ba80 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -1367,7 +1367,7 @@ EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size); u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs) { u64 vf_len = pci_resource_len(dev, resno); - u32 sizes; + u64 sizes; if (!num_vfs) return 0; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 2a1b5456c2dc..cb512bf0df7c 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1587,7 +1587,7 @@ static ssize_t __resource_resize_show(struct device *dev, int n, char *buf) pci_config_pm_runtime_get(pdev); ret = sysfs_emit(buf, "%016llx\n", - (u64)pci_rebar_get_possible_sizes(pdev, n)); + pci_rebar_get_possible_sizes(pdev, n)); pci_config_pm_runtime_put(pdev); diff --git a/drivers/pci/rebar.c b/drivers/pci/rebar.c index e99b89bd5e51..7f6dece19138 100644 --- a/drivers/pci/rebar.c +++ b/drivers/pci/rebar.c @@ -105,7 +105,7 @@ static int pci_rebar_find_pos(struct pci_dev *pdev, int bar) * Return: A bitmask of possible sizes (bit 0=1MB, bit 31=128TB), or %0 if * BAR isn't resizable. */ -u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) +u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar) { int pos; u32 cap; @@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(pci_rebar_size_supported); */ int pci_rebar_get_max_size(struct pci_dev *pdev, int bar) { - u32 sizes; + u64 sizes; sizes = pci_rebar_get_possible_sizes(pdev, bar); if (!sizes) diff --git a/include/linux/pci.h b/include/linux/pci.h index 898bc3a4e8e7..4b7f4c08b5c7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1423,7 +1423,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); /* Resizable BAR related routines */ int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); -u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); +u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); int pci_rebar_get_max_size(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, -- cgit v1.2.3 From f86e51399c2a911a5b01d441de513f17bf773856 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Thu, 13 Nov 2025 17:02:27 -0800 Subject: PCI/IDE: Add Address Association Register setup for downstream MMIO The address ranges for downstream Address Association Registers need to cover memory addresses for all functions (PFs/VFs/downstream devices) managed by a Device Security Manager (DSM). The proposed solution is get the memory (32-bit only) range and prefetchable-memory (64-bit capable) range from the immediate ancestor downstream port (either the direct-attach RP or deepest switch port when switch attached). Similar to RID association, address associations will be set by default if hardware sets 'Number of Address Association Register Blocks' in the 'Selective IDE Stream Capability Register' to a non-zero value. TSM drivers can opt-out of the settings by zero'ing out unwanted / unsupported address ranges. E.g. TDX Connect only supports prefetachable (64-bit capable) memory ranges for the Address Association setting. If the immediate downstream port provides both a memory range and prefetchable-memory range, but the IDE partner port only provides 1 Address Association Register block then the TSM driver can pick which range to associate, or let the PCI core prioritize memory. Note, the Address Association Register setup for upstream requests is still uncertain so is not included. Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Co-developed-by: Arto Merilainen Signed-off-by: Arto Merilainen Signed-off-by: Xu Yilun Co-developed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251114010227.567693-1-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/ide.c | 117 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/pci-ide.h | 32 +++++++++++++ include/linux/pci.h | 5 +++ 3 files changed, 145 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index da5b1acccbb4..2e0856a4307b 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -155,8 +155,11 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) { /* EP, RP, + HB Stream allocation */ struct stream_index __stream[PCI_IDE_HB + 1]; + struct pci_bus_region pref_assoc = { 0, -1 }; + struct pci_bus_region mem_assoc = { 0, -1 }; + struct resource *mem, *pref; struct pci_host_bridge *hb; - struct pci_dev *rp; + struct pci_dev *rp, *br; int num_vf, rid_end; if (!pci_is_pcie(pdev)) @@ -197,6 +200,21 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) else rid_end = pci_dev_id(pdev); + br = pci_upstream_bridge(pdev); + if (!br) + return NULL; + + /* + * Check if the device consumes memory and/or prefetch-memory. Setup + * downstream address association ranges for each. + */ + mem = pci_resource_n(br, PCI_BRIDGE_MEM_WINDOW); + pref = pci_resource_n(br, PCI_BRIDGE_PREF_MEM_WINDOW); + if (resource_assigned(mem)) + pcibios_resource_to_bus(br->bus, &mem_assoc, mem); + if (resource_assigned(pref)) + pcibios_resource_to_bus(br->bus, &pref_assoc, pref); + *ide = (struct pci_ide) { .pdev = pdev, .partner = { @@ -204,11 +222,16 @@ struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev) .rid_start = pci_dev_id(rp), .rid_end = pci_dev_id(rp), .stream_index = no_free_ptr(ep_stream)->stream_index, + /* Disable upstream address association */ + .mem_assoc = { 0, -1 }, + .pref_assoc = { 0, -1 }, }, [PCI_IDE_RP] = { .rid_start = pci_dev_id(pdev), .rid_end = rid_end, .stream_index = no_free_ptr(rp_stream)->stream_index, + .mem_assoc = mem_assoc, + .pref_assoc = pref_assoc, }, }, .host_bridge_stream = no_free_ptr(hb_stream)->stream_index, @@ -385,6 +408,63 @@ static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide, pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val); } +#define SEL_ADDR1_LOWER GENMASK(31, 20) +#define SEL_ADDR_UPPER GENMASK_ULL(63, 32) +#define PREP_PCI_IDE_SEL_ADDR1(base, limit) \ + (FIELD_PREP(PCI_IDE_SEL_ADDR_1_VALID, 1) | \ + FIELD_PREP(PCI_IDE_SEL_ADDR_1_BASE_LOW, \ + FIELD_GET(SEL_ADDR1_LOWER, (base))) | \ + FIELD_PREP(PCI_IDE_SEL_ADDR_1_LIMIT_LOW, \ + FIELD_GET(SEL_ADDR1_LOWER, (limit)))) + +static void mem_assoc_to_regs(struct pci_bus_region *region, + struct pci_ide_regs *regs, int idx) +{ + /* convert to u64 range for bitfield size checks */ + struct range r = { region->start, region->end }; + + regs->addr[idx].assoc1 = PREP_PCI_IDE_SEL_ADDR1(r.start, r.end); + regs->addr[idx].assoc2 = FIELD_GET(SEL_ADDR_UPPER, r.end); + regs->addr[idx].assoc3 = FIELD_GET(SEL_ADDR_UPPER, r.start); +} + +/** + * pci_ide_stream_to_regs() - convert IDE settings to association register values + * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port + * @ide: registered IDE settings descriptor + * @regs: output register values + */ +static void pci_ide_stream_to_regs(struct pci_dev *pdev, struct pci_ide *ide, + struct pci_ide_regs *regs) +{ + struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + int assoc_idx = 0; + + memset(regs, 0, sizeof(*regs)); + + if (!settings) + return; + + regs->rid1 = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end); + + regs->rid2 = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) | + FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) | + FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev)); + + if (pdev->nr_ide_mem && pci_bus_region_size(&settings->mem_assoc)) { + mem_assoc_to_regs(&settings->mem_assoc, regs, assoc_idx); + assoc_idx++; + } + + if (pdev->nr_ide_mem > assoc_idx && + pci_bus_region_size(&settings->pref_assoc)) { + mem_assoc_to_regs(&settings->pref_assoc, regs, assoc_idx); + assoc_idx++; + } + + regs->nr_addr = assoc_idx; +} + /** * pci_ide_stream_setup() - program settings to Selective IDE Stream registers * @pdev: PCIe device object for either a Root Port or Endpoint Partner Port @@ -398,22 +478,34 @@ static void set_ide_sel_ctl(struct pci_dev *pdev, struct pci_ide *ide, void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide) { struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); + struct pci_ide_regs regs; int pos; - u32 val; if (!settings) return; + pci_ide_stream_to_regs(pdev, ide, ®s); + pos = sel_ide_offset(pdev, settings); - val = FIELD_PREP(PCI_IDE_SEL_RID_1_LIMIT, settings->rid_end); - pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, val); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, regs.rid1); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, regs.rid2); - val = FIELD_PREP(PCI_IDE_SEL_RID_2_VALID, 1) | - FIELD_PREP(PCI_IDE_SEL_RID_2_BASE, settings->rid_start) | - FIELD_PREP(PCI_IDE_SEL_RID_2_SEG, pci_ide_domain(pdev)); + for (int i = 0; i < regs.nr_addr; i++) { + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), + regs.addr[i].assoc1); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), + regs.addr[i].assoc2); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), + regs.addr[i].assoc3); + } - pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, val); + /* clear extra unused address association blocks */ + for (int i = regs.nr_addr; i < pdev->nr_ide_mem; i++) { + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), 0); + } /* * Setup control register early for devices that expect @@ -436,7 +528,7 @@ EXPORT_SYMBOL_GPL(pci_ide_stream_setup); void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide) { struct pci_ide_partner *settings = pci_ide_to_settings(pdev, ide); - int pos; + int pos, i; if (!settings) return; @@ -444,6 +536,13 @@ void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide) pos = sel_ide_offset(pdev, settings); pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, 0); + + for (i = 0; i < pdev->nr_ide_mem; i++) { + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_1(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_2(i), 0); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_ADDR_3(i), 0); + } + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_2, 0); pci_write_config_dword(pdev, pos + PCI_IDE_SEL_RID_1, 0); settings->setup = 0; diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index d0f10f3c89fc..93194338e4d0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -28,21 +28,53 @@ enum pci_ide_partner_select { * @rid_start: Partner Port Requester ID range start * @rid_end: Partner Port Requester ID range end * @stream_index: Selective IDE Stream Register Block selection + * @mem_assoc: PCI bus memory address association for targeting peer partner + * @pref_assoc: PCI bus prefetchable memory address association for + * targeting peer partner * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of * address and RID association registers * @setup: flag to track whether to run pci_ide_stream_teardown() for this * partner slot * @enable: flag whether to run pci_ide_stream_disable() for this partner slot + * + * By default, pci_ide_stream_alloc() initializes @mem_assoc and @pref_assoc + * with the immediate ancestor downstream port memory ranges (i.e. Type 1 + * Configuration Space Header values). Caller may zero size ({0, -1}) the range + * to drop it from consideration at pci_ide_stream_setup() time. */ struct pci_ide_partner { u16 rid_start; u16 rid_end; u8 stream_index; + struct pci_bus_region mem_assoc; + struct pci_bus_region pref_assoc; unsigned int default_stream:1; unsigned int setup:1; unsigned int enable:1; }; +/** + * struct pci_ide_regs - Hardware register association settings for Selective + * IDE Streams + * @rid1: IDE RID Association Register 1 + * @rid2: IDE RID Association Register 2 + * @addr: Up to two address association blocks (IDE Address Association Register + * 1 through 3) for MMIO and prefetchable MMIO + * @nr_addr: Number of address association blocks initialized + * + * See pci_ide_stream_to_regs() + */ +struct pci_ide_regs { + u32 rid1; + u32 rid2; + struct { + u32 assoc1; + u32 assoc2; + u32 assoc3; + } addr[2]; + int nr_addr; +}; + /** * struct pci_ide - PCIe Selective IDE Stream descriptor * @pdev: PCIe Endpoint in the pci_ide_partner pair diff --git a/include/linux/pci.h b/include/linux/pci.h index 2c8dbae4916c..ba39ca78b382 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -870,6 +870,11 @@ struct pci_bus_region { pci_bus_addr_t end; }; +static inline pci_bus_addr_t pci_bus_region_size(const struct pci_bus_region *region) +{ + return region->end - region->start + 1; +} + struct pci_dynids { spinlock_t lock; /* Protects list, index */ struct list_head list; /* For IDs added at runtime */ -- cgit v1.2.3 From 079115370d00c78ef69b31dd15def90adf2aa579 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:43 -0800 Subject: PCI/IDE: Initialize an ID for all IDE streams The PCIe spec defines two types of streams - selective and link. Each stream has an ID from the same bucket so a stream ID does not tell the type. The spec defines an "enable" bit for every stream and required stream IDs to be unique among all enabled stream but there is no such requirement for disabled streams. However, when IDE_KM is programming keys, an IDE-capable device needs to know the type of stream being programmed to write it directly to the hardware as keys are relatively large, possibly many of them and devices often struggle with keeping around rather big data not being used. Walk through all streams on a device and initialise the IDs to some unique number, both link and selective. The weakest part of this proposal is the host bridge ide_stream_ids_ida. Technically, a Stream ID only needs to be unique within a given partner pair. However, with "anonymous" / unassigned streams there is no convenient place to track the available ids. Proceed with an ida in the host bridge for now, but consider moving this tracking to be an ide_stream_ids_ida per device. Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-6-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/ide.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 2 + drivers/pci/remove.c | 1 + include/linux/pci-ide.h | 6 +++ include/linux/pci.h | 1 + 5 files changed, 139 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/ide.c b/drivers/pci/ide.c index 2e0856a4307b..f0ef474e1a0d 100644 --- a/drivers/pci/ide.c +++ b/drivers/pci/ide.c @@ -35,8 +35,50 @@ static int sel_ide_offset(struct pci_dev *pdev, settings->stream_index, pdev->nr_ide_mem); } +static bool reserve_stream_index(struct pci_dev *pdev, u8 idx) +{ + int ret; + + ret = ida_alloc_range(&pdev->ide_stream_ida, idx, idx, GFP_KERNEL); + return ret >= 0; +} + +static bool reserve_stream_id(struct pci_host_bridge *hb, u8 id) +{ + int ret; + + ret = ida_alloc_range(&hb->ide_stream_ids_ida, id, id, GFP_KERNEL); + return ret >= 0; +} + +static bool claim_stream(struct pci_host_bridge *hb, u8 stream_id, + struct pci_dev *pdev, u8 stream_idx) +{ + dev_info(&hb->dev, "Stream ID %d active at init\n", stream_id); + if (!reserve_stream_id(hb, stream_id)) { + dev_info(&hb->dev, "Failed to claim %s Stream ID %d\n", + stream_id == PCI_IDE_RESERVED_STREAM_ID ? "reserved" : + "active", + stream_id); + return false; + } + + /* No stream index to reserve in the Link IDE case */ + if (!pdev) + return true; + + if (!reserve_stream_index(pdev, stream_idx)) { + pci_info(pdev, "Failed to claim active Selective Stream %d\n", + stream_idx); + return false; + } + + return true; +} + void pci_ide_init(struct pci_dev *pdev) { + struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); u16 nr_link_ide, nr_ide_mem, nr_streams; u16 ide_cap; u32 val; @@ -83,6 +125,7 @@ void pci_ide_init(struct pci_dev *pdev) int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem); int nr_assoc; u32 val; + u8 id; pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val); @@ -98,6 +141,51 @@ void pci_ide_init(struct pci_dev *pdev) } nr_ide_mem = nr_assoc; + + /* + * Claim Stream IDs and Selective Stream blocks that are already + * active on the device + */ + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CTL, &val); + id = FIELD_GET(PCI_IDE_SEL_CTL_ID, val); + if ((val & PCI_IDE_SEL_CTL_EN) && + !claim_stream(hb, id, pdev, i)) + return; + } + + /* Reserve link stream-ids that are already active on the device */ + for (u16 i = 0; i < nr_link_ide; ++i) { + int pos = ide_cap + PCI_IDE_LINK_STREAM_0 + i * PCI_IDE_LINK_BLOCK_SIZE; + u8 id; + + pci_read_config_dword(pdev, pos + PCI_IDE_LINK_CTL_0, &val); + id = FIELD_GET(PCI_IDE_LINK_CTL_ID, val); + if ((val & PCI_IDE_LINK_CTL_EN) && + !claim_stream(hb, id, NULL, -1)) + return; + } + + for (u16 i = 0; i < nr_streams; i++) { + int pos = __sel_ide_offset(ide_cap, nr_link_ide, i, nr_ide_mem); + + pci_read_config_dword(pdev, pos + PCI_IDE_SEL_CAP, &val); + if (val & PCI_IDE_SEL_CTL_EN) + continue; + val &= ~PCI_IDE_SEL_CTL_ID; + val |= FIELD_PREP(PCI_IDE_SEL_CTL_ID, PCI_IDE_RESERVED_STREAM_ID); + pci_write_config_dword(pdev, pos + PCI_IDE_SEL_CTL, val); + } + + for (u16 i = 0; i < nr_link_ide; ++i) { + int pos = ide_cap + PCI_IDE_LINK_STREAM_0 + + i * PCI_IDE_LINK_BLOCK_SIZE; + + pci_read_config_dword(pdev, pos, &val); + if (val & PCI_IDE_LINK_CTL_EN) + continue; + val &= ~PCI_IDE_LINK_CTL_ID; + val |= FIELD_PREP(PCI_IDE_LINK_CTL_ID, PCI_IDE_RESERVED_STREAM_ID); + pci_write_config_dword(pdev, pos, val); } pdev->ide_cap = ide_cap; @@ -301,6 +389,28 @@ void pci_ide_stream_release(struct pci_ide *ide) } EXPORT_SYMBOL_GPL(pci_ide_stream_release); +struct pci_ide_stream_id { + struct pci_host_bridge *hb; + u8 stream_id; +}; + +static struct pci_ide_stream_id * +request_stream_id(struct pci_host_bridge *hb, u8 stream_id, + struct pci_ide_stream_id *sid) +{ + if (!reserve_stream_id(hb, stream_id)) + return NULL; + + *sid = (struct pci_ide_stream_id) { + .hb = hb, + .stream_id = stream_id, + }; + + return sid; +} +DEFINE_FREE(free_stream_id, struct pci_ide_stream_id *, + if (_T) ida_free(&_T->hb->ide_stream_ids_ida, _T->stream_id)) + /** * pci_ide_stream_register() - Prepare to activate an IDE Stream * @ide: IDE settings descriptor @@ -313,6 +423,7 @@ int pci_ide_stream_register(struct pci_ide *ide) { struct pci_dev *pdev = ide->pdev; struct pci_host_bridge *hb = pci_find_host_bridge(pdev->bus); + struct pci_ide_stream_id __sid; u8 ep_stream, rp_stream; int rc; @@ -321,6 +432,13 @@ int pci_ide_stream_register(struct pci_ide *ide) return -ENXIO; } + struct pci_ide_stream_id *sid __free(free_stream_id) = + request_stream_id(hb, ide->stream_id, &__sid); + if (!sid) { + pci_err(pdev, "Setup fail: Stream ID %d in use\n", ide->stream_id); + return -EBUSY; + } + ep_stream = ide->partner[PCI_IDE_EP].stream_index; rp_stream = ide->partner[PCI_IDE_RP].stream_index; const char *name __free(kfree) = kasprintf(GFP_KERNEL, "stream%d.%d.%d", @@ -335,6 +453,9 @@ int pci_ide_stream_register(struct pci_ide *ide) ide->name = no_free_ptr(name); + /* Stream ID reservation recorded in @ide is now successfully registered */ + retain_and_null_ptr(sid); + return 0; } EXPORT_SYMBOL_GPL(pci_ide_stream_register); @@ -353,6 +474,7 @@ void pci_ide_stream_unregister(struct pci_ide *ide) sysfs_remove_link(&hb->dev.kobj, ide->name); kfree(ide->name); + ida_free(&hb->ide_stream_ids_ida, ide->stream_id); ide->name = NULL; } EXPORT_SYMBOL_GPL(pci_ide_stream_unregister); @@ -616,6 +738,8 @@ void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { hb->nr_ide_streams = 256; ida_init(&hb->ide_stream_ida); + ida_init(&hb->ide_stream_ids_ida); + reserve_stream_id(hb, PCI_IDE_RESERVED_STREAM_ID); } static ssize_t available_secure_streams_show(struct device *dev, @@ -684,3 +808,8 @@ void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr) sysfs_update_group(&hb->dev.kobj, &pci_ide_attr_group); } EXPORT_SYMBOL_NS_GPL(pci_ide_set_nr_streams, "PCI_IDE"); + +void pci_ide_destroy(struct pci_dev *pdev) +{ + ida_destroy(&pdev->ide_stream_ida); +} diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index f6ffe5ee4717..641c0b53c4e3 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -616,10 +616,12 @@ static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #ifdef CONFIG_PCI_IDE void pci_ide_init(struct pci_dev *dev); void pci_ide_init_host_bridge(struct pci_host_bridge *hb); +void pci_ide_destroy(struct pci_dev *dev); extern const struct attribute_group pci_ide_attr_group; #else static inline void pci_ide_init(struct pci_dev *dev) { } static inline void pci_ide_init_host_bridge(struct pci_host_bridge *hb) { } +static inline void pci_ide_destroy(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCI_TSM diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 803391892c4a..417a9ea59117 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -70,6 +70,7 @@ static void pci_destroy_dev(struct pci_dev *dev) up_write(&pci_bus_sem); pci_doe_destroy(dev); + pci_ide_destroy(dev); pcie_aspm_exit_link_state(dev); pci_bridge_d3_update(dev); pci_pwrctrl_unregister(&dev->dev); diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 93194338e4d0..37a1ad9501b0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -97,6 +97,12 @@ struct pci_ide { struct tsm_dev *tsm_dev; }; +/* + * Some devices need help with aliased stream-ids even for idle streams. Use + * this id as the "never enabled" place holder. + */ +#define PCI_IDE_RESERVED_STREAM_ID 255 + void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide); diff --git a/include/linux/pci.h b/include/linux/pci.h index ba39ca78b382..52a235c61023 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -619,6 +619,7 @@ struct pci_host_bridge { #ifdef CONFIG_PCI_IDE u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */ struct ida ide_stream_ida; + struct ida ide_stream_ids_ida; /* track unique ids per domain */ #endif u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); -- cgit v1.2.3 From 50cbec192f5317e29be993e2a634bbbdfcf0230e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:44 -0800 Subject: PCI/TSM: Add pci_tsm_bind() helper for instantiating TDIs After a PCIe device has established a secure link and session between a TEE Security Manager (TSM) and its local Device Security Manager (DSM), the device or its subfunctions are candidates to be bound to a private memory context, a TVM. A PCIe device function interface assigned to a TVM is a TEE Device Interface (TDI). The pci_tsm_bind() requests the low-level TSM driver to associate the device with private MMIO and private IOMMU context resources of a given TVM represented by a @kvm argument. A device in the bound state corresponds to the TDISP protocol LOCKED state and awaits validation by the TVM. It is a 'struct pci_tsm_link_ops' operation because, similar to IDE establishment, it involves host side resource establishment and context setup on behalf of the guest. It is also expected to be performed lazily to allow for operation of the device in non-confidential "shared" context for pre-lock configuration. Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-7-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/tsm.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/pci-tsm.h | 34 +++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c index 6a2849f77adc..39de91a47a26 100644 --- a/drivers/pci/tsm.c +++ b/drivers/pci/tsm.c @@ -270,6 +270,95 @@ static int remove_fn(struct pci_dev *pdev, void *data) return 0; } +/* + * Note, this helper only returns an error code and takes an argument for + * compatibility with the pci_walk_bus() callback prototype. pci_tsm_unbind() + * always succeeds. + */ +static int __pci_tsm_unbind(struct pci_dev *pdev, void *data) +{ + struct pci_tdi *tdi; + struct pci_tsm_pf0 *tsm_pf0; + + lockdep_assert_held(&pci_tsm_rwsem); + + if (!pdev->tsm) + return 0; + + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + guard(mutex)(&tsm_pf0->lock); + + tdi = pdev->tsm->tdi; + if (!tdi) + return 0; + + to_pci_tsm_ops(pdev->tsm)->unbind(tdi); + pdev->tsm->tdi = NULL; + + return 0; +} + +void pci_tsm_unbind(struct pci_dev *pdev) +{ + guard(rwsem_read)(&pci_tsm_rwsem); + __pci_tsm_unbind(pdev, NULL); +} +EXPORT_SYMBOL_GPL(pci_tsm_unbind); + +/** + * pci_tsm_bind() - Bind @pdev as a TDI for @kvm + * @pdev: PCI device function to bind + * @kvm: Private memory attach context + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + * + * Returns 0 on success, or a negative error code on failure. + * + * Context: Caller is responsible for constraining the bind lifetime to the + * registered state of the device. For example, pci_tsm_bind() / + * pci_tsm_unbind() limited to the VFIO driver bound state of the device. + */ +int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id) +{ + struct pci_tsm_pf0 *tsm_pf0; + struct pci_tdi *tdi; + + if (!kvm) + return -EINVAL; + + guard(rwsem_read)(&pci_tsm_rwsem); + + if (!pdev->tsm) + return -EINVAL; + + if (!is_link_tsm(pdev->tsm->tsm_dev)) + return -ENXIO; + + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + guard(mutex)(&tsm_pf0->lock); + + /* Resolve races to bind a TDI */ + if (pdev->tsm->tdi) { + if (pdev->tsm->tdi->kvm != kvm) + return -EBUSY; + return 0; + } + + tdi = to_pci_tsm_ops(pdev->tsm)->bind(pdev, kvm, tdi_id); + if (IS_ERR(tdi)) + return PTR_ERR(tdi); + + pdev->tsm->tdi = tdi; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_tsm_bind); + +static void pci_tsm_unbind_all(struct pci_dev *pdev) +{ + pci_tsm_walk_fns_reverse(pdev, __pci_tsm_unbind, NULL); + __pci_tsm_unbind(pdev, NULL); +} + static void __pci_tsm_disconnect(struct pci_dev *pdev) { struct pci_tsm_pf0 *tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); @@ -278,6 +367,8 @@ static void __pci_tsm_disconnect(struct pci_dev *pdev) /* disconnect() mutually exclusive with subfunction pci_tsm_init() */ lockdep_assert_held_write(&pci_tsm_rwsem); + pci_tsm_unbind_all(pdev); + /* * disconnect() is uninterruptible as it may be called for device * teardown @@ -439,6 +530,22 @@ static struct pci_dev *find_dsm_dev(struct pci_dev *pdev) return NULL; } +/** + * pci_tsm_tdi_constructor() - base 'struct pci_tdi' initialization for link TSMs + * @pdev: PCI device function representing the TDI + * @tdi: context to initialize + * @kvm: Private memory attach context + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + */ +void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, + struct kvm *kvm, u32 tdi_id) +{ + tdi->pdev = pdev; + tdi->kvm = kvm; + tdi->tdi_id = tdi_id; +} +EXPORT_SYMBOL_GPL(pci_tsm_tdi_constructor); + /** * pci_tsm_link_constructor() - base 'struct pci_tsm' initialization for link TSMs * @pdev: The PCI device @@ -532,7 +639,7 @@ int pci_tsm_register(struct tsm_dev *tsm_dev) static void pci_tsm_fn_exit(struct pci_dev *pdev) { - /* TODO: unbind the fn */ + __pci_tsm_unbind(pdev, NULL); tsm_remove(pdev->tsm); } diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index d7b078d5e272..a5e297677917 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -6,6 +6,8 @@ struct pci_tsm; struct tsm_dev; +struct kvm; +enum pci_tsm_req_scope; /* * struct pci_tsm_ops - manage confidential links and security state @@ -29,12 +31,16 @@ struct pci_tsm_ops { * @connect: establish / validate a secure connection (e.g. IDE) * with the device * @disconnect: teardown the secure link + * @bind: bind a TDI in preparation for it to be accepted by a TVM + * @unbind: remove a TDI from secure operation with a TVM * * Context: @probe, @remove, @connect, and @disconnect run under * pci_tsm_rwsem held for write to sync with TSM unregistration and * mutual exclusion of @connect and @disconnect. @connect and * @disconnect additionally run under the DSM lock (struct * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. + * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM + * lock. */ struct_group_tagged(pci_tsm_link_ops, link_ops, struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, @@ -42,6 +48,9 @@ struct pci_tsm_ops { void (*remove)(struct pci_tsm *tsm); int (*connect)(struct pci_dev *pdev); void (*disconnect)(struct pci_dev *pdev); + struct pci_tdi *(*bind)(struct pci_dev *pdev, + struct kvm *kvm, u32 tdi_id); + void (*unbind)(struct pci_tdi *tdi); ); /* @@ -61,12 +70,25 @@ struct pci_tsm_ops { ); }; +/** + * struct pci_tdi - Core TEE I/O Device Interface (TDI) context + * @pdev: host side representation of guest-side TDI + * @kvm: TEE VM context of bound TDI + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + */ +struct pci_tdi { + struct pci_dev *pdev; + struct kvm *kvm; + u32 tdi_id; +}; + /** * struct pci_tsm - Core TSM context for a given PCIe endpoint * @pdev: Back ref to device function, distinguishes type of pci_tsm context * @dsm_dev: PCI Device Security Manager for link operations on @pdev * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device * Function Security operations + * @tdi: TDI context established by the @bind link operation * * This structure is wrapped by low level TSM driver data and returned by * probe()/lock(), it is freed by the corresponding remove()/unlock(). @@ -82,6 +104,7 @@ struct pci_tsm { struct pci_dev *pdev; struct pci_dev *dsm_dev; struct tsm_dev *tsm_dev; + struct pci_tdi *tdi; }; /** @@ -139,6 +162,10 @@ int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm); int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, size_t req_sz, void *resp, size_t resp_sz); +int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); +void pci_tsm_unbind(struct pci_dev *pdev); +void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, + struct kvm *kvm, u32 tdi_id); #else static inline int pci_tsm_register(struct tsm_dev *tsm_dev) { @@ -147,5 +174,12 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev) static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) { } +static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id) +{ + return -ENXIO; +} +static inline void pci_tsm_unbind(struct pci_dev *pdev) +{ +} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From c316c75d57fbb34e2305690813f4dbec9311f2b0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:45 -0800 Subject: PCI/TSM: Add pci_tsm_guest_req() for managing TDIs A PCIe device function interface assigned to a TVM is a TEE Device Interface (TDI). A TDI instantiated by pci_tsm_bind() needs additional steps taken by the TVM to be accepted into the TVM's Trusted Compute Boundary (TCB) and transitioned to the RUN state. pci_tsm_guest_req() is a channel for the guest to request TDISP collateral, like Device Interface Reports, and effect TDISP state changes, like LOCKED->RUN transititions. Similar to IDE establishment and pci_tsm_bind(), these are long running operations involving SPDM message passing via the DOE mailbox. The path for a TVM to invoke pci_tsm_guest_req() is: * TSM triggers exit via guest-to-host-interface ABI (implementation specific) * VMM invokes handler (KVM handle_exit() -> userspace io) * handler issues request (userspace io handler -> ioctl() -> pci_tsm_guest_req()) * handler supplies response * VMM posts response, notifies/re-enters TVM This path is purely a transport for messages from TVM to platform TSM. By design the host kernel does not and must not care about the content of these messages. I.e. the host kernel is not in the TCB of the TVM. As this is an opaque passthrough interface, similar to fwctl, the kernel requires that implementations stay within the bounds defined by 'enum pci_tsm_req_scope'. Violation of those expectations likely has market and regulatory consequences. Out of scope requests are blocked by default. Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-8-dan.j.williams@intel.com Signed-off-by: Dan Williams --- drivers/pci/tsm.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci-tsm.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/tsm.c b/drivers/pci/tsm.c index 39de91a47a26..5e57501f693e 100644 --- a/drivers/pci/tsm.c +++ b/drivers/pci/tsm.c @@ -353,6 +353,66 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id) } EXPORT_SYMBOL_GPL(pci_tsm_bind); +/** + * pci_tsm_guest_req() - helper to marshal guest requests to the TSM driver + * @pdev: @pdev representing a bound tdi + * @scope: caller asserts this passthrough request is limited to TDISP operations + * @req_in: Input payload forwarded from the guest + * @in_len: Length of @req_in + * @req_out: Output payload buffer response to the guest + * @out_len: Length of @req_out on input, bytes filled in @req_out on output + * @tsm_code: Optional TSM arch specific result code for the guest TSM + * + * This is a common entry point for requests triggered by userspace KVM-exit + * service handlers responding to TDI information or state change requests. The + * scope parameter limits requests to TDISP state management, or limited debug. + * This path is only suitable for commands and results that are the host kernel + * has no use, the host is only facilitating guest to TSM communication. + * + * Returns 0 on success and -error on failure and positive "residue" on success + * but @req_out is filled with less then @out_len, or @req_out is NULL and a + * residue number of bytes were not consumed from @req_in. On success or + * failure @tsm_code may be populated with a TSM implementation specific result + * code for the guest to consume. + * + * Context: Caller is responsible for calling this within the pci_tsm_bind() + * state of the TDI. + */ +ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, sockptr_t req_out, + size_t out_len, u64 *tsm_code) +{ + struct pci_tsm_pf0 *tsm_pf0; + struct pci_tdi *tdi; + int rc; + + /* Forbid requests that are not directly related to TDISP operations */ + if (scope > PCI_TSM_REQ_STATE_CHANGE) + return -EINVAL; + + ACQUIRE(rwsem_read_intr, lock)(&pci_tsm_rwsem); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &lock))) + return rc; + + if (!pdev->tsm) + return -ENXIO; + + if (!is_link_tsm(pdev->tsm->tsm_dev)) + return -ENXIO; + + tsm_pf0 = to_pci_tsm_pf0(pdev->tsm); + ACQUIRE(mutex_intr, ops_lock)(&tsm_pf0->lock); + if ((rc = ACQUIRE_ERR(mutex_intr, &ops_lock))) + return rc; + + tdi = pdev->tsm->tdi; + if (!tdi) + return -ENXIO; + return to_pci_tsm_ops(pdev->tsm)->guest_req(tdi, scope, req_in, in_len, + req_out, out_len, tsm_code); +} +EXPORT_SYMBOL_GPL(pci_tsm_guest_req); + static void pci_tsm_unbind_all(struct pci_dev *pdev) { pci_tsm_walk_fns_reverse(pdev, __pci_tsm_unbind, NULL); diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index a5e297677917..a6435aba03f9 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -3,6 +3,7 @@ #define __PCI_TSM_H #include #include +#include struct pci_tsm; struct tsm_dev; @@ -33,14 +34,15 @@ struct pci_tsm_ops { * @disconnect: teardown the secure link * @bind: bind a TDI in preparation for it to be accepted by a TVM * @unbind: remove a TDI from secure operation with a TVM + * @guest_req: marshal TVM information and state change requests * * Context: @probe, @remove, @connect, and @disconnect run under * pci_tsm_rwsem held for write to sync with TSM unregistration and * mutual exclusion of @connect and @disconnect. @connect and * @disconnect additionally run under the DSM lock (struct * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. - * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM - * lock. + * @bind, @unbind, and @guest_req run under pci_tsm_rwsem held for read + * and the DSM lock. */ struct_group_tagged(pci_tsm_link_ops, link_ops, struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, @@ -51,6 +53,11 @@ struct pci_tsm_ops { struct pci_tdi *(*bind)(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); void (*unbind)(struct pci_tdi *tdi); + ssize_t (*guest_req)(struct pci_tdi *tdi, + enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, + sockptr_t req_out, size_t out_len, + u64 *tsm_code); ); /* @@ -152,6 +159,46 @@ static inline bool is_pci_tsm_pf0(struct pci_dev *pdev) return PCI_FUNC(pdev->devfn) == 0; } +/** + * enum pci_tsm_req_scope - Scope of guest requests to be validated by TSM + * + * Guest requests are a transport for a TVM to communicate with a TSM + DSM for + * a given TDI. A TSM driver is responsible for maintaining the kernel security + * model and limit commands that may affect the host, or are otherwise outside + * the typical TDISP operational model. + */ +enum pci_tsm_req_scope { + /** + * @PCI_TSM_REQ_INFO: Read-only, without side effects, request for + * typical TDISP collateral information like Device Interface Reports. + * No device secrets are permitted, and no device state is changed. + */ + PCI_TSM_REQ_INFO = 0, + /** + * @PCI_TSM_REQ_STATE_CHANGE: Request to change the TDISP state from + * UNLOCKED->LOCKED, LOCKED->RUN, or other architecture specific state + * changes to support those transitions for a TDI. No other (unrelated + * to TDISP) device / host state, configuration, or data change is + * permitted. + */ + PCI_TSM_REQ_STATE_CHANGE = 1, + /** + * @PCI_TSM_REQ_DEBUG_READ: Read-only request for debug information + * + * A method to facilitate TVM information retrieval outside of typical + * TDISP operational requirements. No device secrets are permitted. + */ + PCI_TSM_REQ_DEBUG_READ = 2, + /** + * @PCI_TSM_REQ_DEBUG_WRITE: Device state changes for debug purposes + * + * The request may affect the operational state of the device outside of + * the TDISP operational model. If allowed, requires CAP_SYS_RAW_IO, and + * will taint the kernel. + */ + PCI_TSM_REQ_DEBUG_WRITE = 3, +}; + #ifdef CONFIG_PCI_TSM int pci_tsm_register(struct tsm_dev *tsm_dev); void pci_tsm_unregister(struct tsm_dev *tsm_dev); @@ -166,6 +213,9 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); void pci_tsm_unbind(struct pci_dev *pdev); void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, struct kvm *kvm, u32 tdi_id); +ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, sockptr_t req_out, + size_t out_len, u64 *tsm_code); #else static inline int pci_tsm_register(struct tsm_dev *tsm_dev) { @@ -181,5 +231,13 @@ static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id static inline void pci_tsm_unbind(struct pci_dev *pdev) { } +static inline ssize_t pci_tsm_guest_req(struct pci_dev *pdev, + enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, + sockptr_t req_out, size_t out_len, + u64 *tsm_code) +{ + return -ENXIO; +} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From 6802d8af47d1dccd9a74a1f708fb9129244ef843 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:04 -0700 Subject: Drivers: hv: VMBus protocol version 6.0 The confidential VMBus is supported starting from the protocol version 6.0 onwards. Provide the required definitions. No functional changes. Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hyperv_vmbus.h | 2 ++ drivers/hv/vmbus_drv.c | 12 ++++++++ include/hyperv/hvgdk_mini.h | 1 + include/linux/hyperv.h | 69 ++++++++++++++++++++++++++++++++------------- 4 files changed, 65 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 0b450e53161e..4a01797d4851 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -333,6 +333,8 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..3c414560fa5f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -56,6 +56,18 @@ static long __percpu *vmbus_evt; int vmbus_irq; int vmbus_interrupt; +/* + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. + */ +static bool is_confidential; + +bool vmbus_is_confidential(void) +{ + return is_confidential; +} +EXPORT_SYMBOL_GPL(vmbus_is_confidential); + /* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 77abddfc750e..7f730a0e54e6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -260,6 +260,7 @@ union hv_hypervisor_version_info { #define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 /* Support for the extended IOAPIC RTE format */ #define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) +#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3) #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 59826c89171c..dfc516c1c719 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent( * Linux kernel. */ -#define VERSION_WS2008 ((0 << 16) | (13)) -#define VERSION_WIN7 ((1 << 16) | (1)) -#define VERSION_WIN8 ((2 << 16) | (4)) -#define VERSION_WIN8_1 ((3 << 16) | (0)) -#define VERSION_WIN10 ((4 << 16) | (0)) -#define VERSION_WIN10_V4_1 ((4 << 16) | (1)) -#define VERSION_WIN10_V5 ((5 << 16) | (0)) -#define VERSION_WIN10_V5_1 ((5 << 16) | (1)) -#define VERSION_WIN10_V5_2 ((5 << 16) | (2)) -#define VERSION_WIN10_V5_3 ((5 << 16) | (3)) +#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN)) +#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13) +#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1) +#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4) +#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0) +#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0) +#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1) +#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0) +#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1) +#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2) +#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3) +#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0) /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) @@ -335,14 +337,22 @@ struct vmbus_channel_offer { } __packed; /* Server Flags */ -#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4 -#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10 -#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100 -#define VMBUS_CHANNEL_PARENT_OFFER 0x200 -#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400 -#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 +#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for the channel ring buffer. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for GPA direct packets and additional GPADLs. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004 +#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010 +#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100 +#define VMBUS_CHANNEL_PARENT_OFFER 0x0200 +#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400 +#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 struct vmpacket_descriptor { u16 type; @@ -621,6 +631,12 @@ struct vmbus_channel_relid_released { u32 child_relid; } __packed; +/* + * Used by the paravisor only, means that the encrypted ring buffers and + * the encrypted external memory are supported + */ +#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10 + struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; @@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact { struct { u8 msg_sint; u8 msg_vtl; - u8 reserved[6]; + u8 reserved[2]; + u32 feature_flags; /* VMBus version 6.0 */ }; }; u64 monitor_page1; @@ -1003,6 +1020,10 @@ struct vmbus_channel { /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; + /* The ring buffer is encrypted */ + bool co_ring_buffer; + /* The external memory is encrypted */ + bool co_external_memory; }; #define lock_requestor(channel, flags) \ @@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER); +} + +static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY); +} + static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o) { return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); -- cgit v1.2.3 From 796ef5a7fe86a8605f2844471ed7baa8e80bace8 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:47 +0000 Subject: static_call: allow using STATIC_CALL_TRAMP_STR() from assembly STATIC_CALL_TRAMP_STR() could not be used from .S files because static_call_types.h was not safe to include in assembly as it pulled in C types/constructs that are unavailable under __ASSEMBLY__. Make the header assembly-friendly by adding __ASSEMBLY__ checks and providing only the minimal definitions needed for assembly, so that it can be safely included by .S code. This enables emitting the static call trampoline symbol name via STATIC_CALL_TRAMP_STR() directly in assembly sources, to be used with 'call' instruction. Also, move a certain definitions out of __ASSEMBLY__ checks in compiler_types.h to meet the dependencies. No functional change for C compilation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- include/linux/compiler_types.h | 8 ++++---- include/linux/static_call_types.h | 4 ++++ tools/include/linux/static_call_types.h | 4 ++++ 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..6897d4d5cb28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -11,6 +11,10 @@ #define __has_builtin(x) (0) #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + #ifndef __ASSEMBLY__ /* @@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __builtin_warning(x, y...) (1) #endif /* __CHECKER__ */ -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) - #ifdef __KERNEL__ /* Attributes */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ -- cgit v1.2.3 From 4051a9115ad24bb9a691774730ca9c1dd56de665 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Sep 2025 22:19:10 -0400 Subject: new helper: simple_remove_by_name() simple_recursive_removal(), but instead of victim dentry it takes parent + name. Used to be open-coded in fs/fuse/control.c, but there's no need to expose the guts of that thing there and there are other potential users, so let's lift it into libfs... Acked-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/fuse/control.c | 7 +------ fs/libfs.c | 13 +++++++++++++ include/linux/fs.h | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 5247df896c5d..3dca752127ff 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -290,18 +290,13 @@ static void remove_one(struct dentry *dentry) */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { - struct dentry *dentry; char name[32]; if (!fuse_control_sb || fc->no_control) return; sprintf(name, "%u", fc->dev); - dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root); - if (!IS_ERR(dentry)) { - simple_recursive_removal(dentry, remove_one); - dput(dentry); // paired with lookup_noperm_positive_unlocked() - } + simple_remove_by_name(fuse_control_sb->s_root, name, remove_one); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) diff --git a/fs/libfs.c b/fs/libfs.c index ce8c496a6940..d029aff41f66 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -655,6 +655,19 @@ void simple_recursive_removal(struct dentry *dentry, } EXPORT_SYMBOL(simple_recursive_removal); +void simple_remove_by_name(struct dentry *parent, const char *name, + void (*callback)(struct dentry *)) +{ + struct dentry *dentry; + + dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); + if (!IS_ERR(dentry)) { + simple_recursive_removal(dentry, callback); + dput(dentry); // paired with lookup_noperm_positive_unlocked() + } +} +EXPORT_SYMBOL(simple_remove_by_name); + /* caller holds parent directory with I_MUTEX_PARENT */ void locked_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..28bd4e8d3892 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3631,6 +3631,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); +extern void simple_remove_by_name(struct dentry *, const char *, + void (*callback)(struct dentry *)); extern void locked_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); -- cgit v1.2.3 From 1552ddc7fade1ae55af298580ef6c913b8db74bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Sep 2025 17:46:01 -0400 Subject: new helper: simple_done_creating() should be paired with simple_start_creating() - unlocks parent and drops dentry reference. Signed-off-by: Al Viro --- fs/libfs.c | 8 ++++++++ include/linux/fs.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index d029aff41f66..a033f35493d0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2326,3 +2326,11 @@ struct dentry *simple_start_creating(struct dentry *parent, const char *name) return dentry; } EXPORT_SYMBOL(simple_start_creating); + +/* parent must have been held exclusive since simple_start_creating() */ +void simple_done_creating(struct dentry *child) +{ + inode_unlock(child->d_parent->d_inode); + dput(child); +} +EXPORT_SYMBOL(simple_done_creating); diff --git a/include/linux/fs.h b/include/linux/fs.h index 28bd4e8d3892..f5037c556f61 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3662,6 +3662,7 @@ extern int simple_fill_super(struct super_block *, unsigned long, extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); struct dentry *simple_start_creating(struct dentry *, const char *); +void simple_done_creating(struct dentry *); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); -- cgit v1.2.3 From 8a210cacf5dc2a6210ee42aeca5cd03b2400876f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2025 19:15:35 -0500 Subject: introduce a flag for explicitly marking persistently pinned dentries Some filesystems use a kinda-sorta controlled dentry refcount leak to pin dentries of created objects in dcache (and undo it when removing those). Reference is grabbed and not released, but it's not actually _stored_ anywhere. That works, but it's hard to follow and verify; among other things, we have no way to tell _which_ of the increments is intended to be an unpaired one. Worse, on removal we need to decide whether the reference had already been dropped, which can be non-trivial if that removal is on umount and we need to figure out if this dentry is pinned due to e.g. unlink() not done. Usually that is handled by using kill_litter_super() as ->kill_sb(), but there are open-coded special cases of the same (consider e.g. /proc/self). Things get simpler if we introduce a new dentry flag (DCACHE_PERSISTENT) marking those "leaked" dentries. Having it set claims responsibility for +1 in refcount. The end result this series is aiming for: * get these unbalanced dget() and dput() replaced with new primitives that would, in addition to adjusting refcount, set and clear persistency flag. * instead of having kill_litter_super() mess with removing the remaining "leaked" references (e.g. for all tmpfs files that hadn't been removed prior to umount), have the regular shrink_dcache_for_umount() strip DCACHE_PERSISTENT of all dentries, dropping the corresponding reference if it had been set. After that kill_litter_super() becomes an equivalent of kill_anon_super(). Doing that in a single step is not feasible - it would affect too many places in too many filesystems. It has to be split into a series. Here we * introduce the new flag * teach shrink_dcache_for_umount() to handle it (i.e. remove and drop refcount on anything that survives to umount with that flag still set) * teach kill_litter_super() that anything with that flag does *not* need to be unpinned. Next commits will add primitives for maintaing that flag and convert the common helpers to those. After that - a long series of per-filesystem patches converting to those primitives. Signed-off-by: Al Viro --- fs/dcache.c | 27 ++++++++++++++++++++++----- include/linux/dcache.h | 1 + 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..f2c9f4fef2a2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1511,6 +1511,15 @@ out: return ret; } +static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_PERSISTENT) { + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + } + return select_collect(_data, dentry); +} + static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; @@ -1539,18 +1548,20 @@ out: } /** - * shrink_dcache_parent - prune dcache + * shrink_dcache_tree - prune dcache * @parent: parent of entries to prune + * @for_umount: true if we want to unpin the persistent ones * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry *parent) +static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); - d_walk(parent, &data, select_collect); + d_walk(parent, &data, + for_umount ? select_collect_umount : select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); @@ -1575,6 +1586,11 @@ void shrink_dcache_parent(struct dentry *parent) shrink_dentry_list(&data.dispose); } } + +void shrink_dcache_parent(struct dentry *parent) +{ + shrink_dcache_tree(parent, false); +} EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) @@ -1601,7 +1617,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) static void do_one_tree(struct dentry *dentry) { - shrink_dcache_parent(dentry); + shrink_dcache_tree(dentry, true); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); @@ -3111,7 +3127,8 @@ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { - if (d_unhashed(dentry) || !dentry->d_inode) + if (d_unhashed(dentry) || !dentry->d_inode || + dentry->d_flags & DCACHE_PERSISTENT) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..94b58655322a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -225,6 +225,7 @@ enum dentry_flags { DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ DCACHE_DENTRY_CURSOR = BIT(25), DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ + DCACHE_PERSISTENT = BIT(27) }; #define DCACHE_MANAGED_DENTRY \ -- cgit v1.2.3 From bacdf1d70bbe2027619c7bbbe48b379a806a9678 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2025 19:38:04 -0500 Subject: primitives for maintaining persisitency * d_make_persistent(dentry, inode) - bump refcount, mark persistent and make hashed positive. Return value is a borrowed reference to dentry; it can be used until something removes persistency (at the very least, until the parent gets unlocked, but some filesystems may have stronger exclusion). * d_make_discardable() - remove persistency mark and drop reference. d_make_persistent() is similar to combination of d_instantiate(), dget() and setting flag. The only difference is that unlike d_instantiate() it accepts hashed and unhashed negatives alike. It is always called in strong locking environment (parent held exclusive, or, in some cases, dentry coming from d_alloc_name()); if we ever start using it with parent held only shared and dentry coming from d_alloc_parallel(), we'll need to copy the in-lookup logics from __d_add(). d_make_discardable() is eqiuvalent to combination of removing flag and dput(); since flag removal requires ->d_lock, there's no point trying to avoid taking that for refcount decrement as fast_dput() does. The slow path of dput() has been taken into a helper and reused in d_make_discardable() instead. Signed-off-by: Al Viro --- fs/dcache.c | 74 ++++++++++++++++++++++++++++++++++++++++---------- include/linux/dcache.h | 2 ++ 2 files changed, 61 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index f2c9f4fef2a2..3cc6c3876177 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -869,6 +869,24 @@ locked: return false; } +static void finish_dput(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(RCU) +{ + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) + return; + if (retain_dentry(dentry, true)) { + spin_unlock(&dentry->d_lock); + return; + } + rcu_read_lock(); + } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); +} /* * This is dput @@ -906,22 +924,28 @@ void dput(struct dentry *dentry) rcu_read_unlock(); return; } - while (lock_for_kill(dentry)) { - rcu_read_unlock(); - dentry = __dentry_kill(dentry); - if (!dentry) - return; - if (retain_dentry(dentry, true)) { - spin_unlock(&dentry->d_lock); - return; - } - rcu_read_lock(); - } - rcu_read_unlock(); - spin_unlock(&dentry->d_lock); + finish_dput(dentry); } EXPORT_SYMBOL(dput); +void d_make_discardable(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + /* + * By the end of the series we'll add + * WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT); + * here, but while object removal is done by a few common helpers, + * object creation tends to be open-coded (if nothing else, new inode + * needs to be set up), so adding a warning from the very beginning + * would make for much messier patch series. + */ + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + rcu_read_lock(); + finish_dput(dentry); +} +EXPORT_SYMBOL(d_make_discardable); + static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { @@ -1939,7 +1963,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); - spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. @@ -1952,7 +1975,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); - spin_unlock(&dentry->d_lock); } /** @@ -1976,7 +1998,9 @@ void d_instantiate(struct dentry *entry, struct inode * inode) if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); spin_unlock(&inode->i_lock); } } @@ -1995,7 +2019,9 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* @@ -2754,6 +2780,24 @@ void d_add(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_add); +struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!inode); + security_d_instantiate(dentry, inode); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + __d_instantiate(dentry, inode); + dentry->d_flags |= DCACHE_PERSISTENT; + dget_dlock(dentry); + if (d_unhashed(dentry)) + __d_rehash(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + return dentry; +} +EXPORT_SYMBOL(d_make_persistent); + static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 94b58655322a..6ec4066825e3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -611,5 +611,7 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry) } void set_default_d_op(struct super_block *, const struct dentry_operations *); +struct dentry *d_make_persistent(struct dentry *, struct inode *); +void d_make_discardable(struct dentry *dentry); #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From 23cbc7a795853bc7a8d0512b7c686ef879f6e909 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 26 Feb 2024 01:55:36 -0500 Subject: procfs: make /self and /thread_self dentries persistent ... and there's no need to remember those pointers anywhere - ->kill_sb() no longer needs to bother since kill_anon_super() will take care of them anyway and proc_pid_readdir() only wants the inumbers, which we had in a couple of static variables all along. Signed-off-by: Al Viro --- fs/proc/base.c | 6 ++---- fs/proc/internal.h | 1 + fs/proc/root.c | 14 ++++---------- fs/proc/self.c | 10 +++------- fs/proc/thread_self.c | 11 +++-------- include/linux/proc_fs.h | 2 -- 6 files changed, 13 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97..869677a26332 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3585,14 +3585,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(fs_info->proc_self); - if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(fs_info->proc_thread_self); - if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1598576506c..c1e8eb984da8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -373,6 +373,7 @@ static inline void proc_tty_init(void) {} extern struct proc_dir_entry proc_root; extern void proc_self_init(void); +extern unsigned self_inum, thread_self_inum; /* * task_[no]mmu.c diff --git a/fs/proc/root.c b/fs/proc/root.c index 1e24e085c7d5..d8ca41d823e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -347,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); - if (!fs_info) { - kill_anon_super(sb); - return; - } - - dput(fs_info->proc_self); - dput(fs_info->proc_thread_self); - kill_anon_super(sb); - put_pid_ns(fs_info->pid_ns); - kfree_rcu(fs_info, rcu); + if (fs_info) { + put_pid_ns(fs_info->pid_ns); + kfree_rcu(fs_info, rcu); + } } static struct file_system_type proc_fs_type = { diff --git a/fs/proc/self.c b/fs/proc/self.c index b46fbfd22681..62d2c0cfe35c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum __ro_after_init; +unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; @@ -51,18 +50,15 @@ int proc_setup_self(struct super_block *s) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; - d_add(self, inode); + d_make_persistent(self, inode); ret = 0; - } else { - dput(self); } + dput(self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - else - fs_info->proc_self = self; return ret; } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 0e5050d6ab64..d6113dbe58e0 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum __ro_after_init; +unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; @@ -51,19 +50,15 @@ int proc_setup_thread_self(struct super_block *s) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; - d_add(thread_self, inode); + d_make_persistent(thread_self, inode); ret = 0; - } else { - dput(thread_self); } + dput(thread_self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); - else - fs_info->proc_thread_self = thread_self; - return ret; } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index f139377f4b31..19d1c5e5f335 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -66,8 +66,6 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; - struct dentry *proc_self; /* For /proc/self */ - struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; -- cgit v1.2.3 From 566a414558aec1ab263ab8709fa783dfa2e34325 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 2 Oct 2025 10:00:52 -0400 Subject: svcrdma: Increase the server's default RPC/RDMA credit grant The range of commits from commit e3274026e2ec ("SUNRPC: move all of xprt handling into svc_xprt_handle()") to commit 15d39883ee7d ("SUNRPC: change the back-channel queue to lwq") enabled NFSD performance to scale better as the number of nfsd threads is increased. These commits were merged in v6.7. Now that the nfsd thread count can scale to more threads, permit individual clients to make more use of those threads. Increase the RPC/RDMA per-connection credit grant from 64 to 128 -- same as the Linux NFS client. Simple single client fio-based benchmarking so far shows only improvement, no regression. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 22704c2e5b9b..57f4fd94166a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -131,7 +131,7 @@ static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp) */ enum { RPCRDMA_LISTEN_BACKLOG = 10, - RPCRDMA_MAX_REQUESTS = 64, + RPCRDMA_MAX_REQUESTS = 128, RPCRDMA_MAX_BC_REQUESTS = 2, }; -- cgit v1.2.3 From 6b3b697d65d46a0f640216a3f6c72856c159c567 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 13 Oct 2025 09:54:53 -0400 Subject: sunrpc: allocate a separate bvec array for socket sends svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of rq_bvec as the start, but doesn't reduce the array length by one, which could lead to an array overrun. Also, rq_bvec is always rq_maxpages in length, which can be too short in some cases, since the TCP record marker consumes a slot. Fix both problems by adding a separate bvec array to the svc_sock that is specifically for sending. For TCP, make this array one slot longer than rq_maxpages, to account for the record marker. For UDP, only allocate as large an array as we need since it's limited to 64k of payload. Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 3 +++ net/sunrpc/svcsock.c | 55 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 963bbe251e52..de37069aba90 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -26,6 +26,9 @@ struct svc_sock { void (*sk_odata)(struct sock *); void (*sk_owspace)(struct sock *); + /* For sends (protected by xpt_mutex) */ + struct bio_vec *sk_bvec; + /* private TCP part */ /* On-the-wire fragment header: */ __be32 sk_marker; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0cb9c4d45745..93de79020a2d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -68,6 +68,17 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + /* To-do: to avoid tying up an nfsd thread while waiting for a * handshake request, the request could instead be deferred. */ @@ -740,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. */ - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); } @@ -1236,19 +1247,19 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, int ret; /* The stream record marker is copied into a temporary page - * fragment buffer so that it can be included in rq_bvec. + * fragment buffer so that it can be included in sk_bvec. */ buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &marker, sizeof(marker)); - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, &rqstp->rq_res); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); @@ -1393,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} + /* * Initialize socket for RPC use and create svc_sock struct */ @@ -1403,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int sendpages; unsigned long pages; + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + pages = svc_serv_maxpages(serv); svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + + if (sendpages) { + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1420,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { + kfree(svsk->sk_bvec); kfree(svsk); return ERR_PTR(err); } @@ -1637,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(sock); page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } -- cgit v1.2.3 From 37d17925480404f1293f24d027fbf3c9975603d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Sep 2025 11:46:43 +0100 Subject: mm/thp: drop follow_devmap_pmd() default stub follow_devmap_pmd() has already been dropped by the commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). The fallback stub in the header which is now redundant, can be dropped off as well. Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Dev Jain Reviewed-by: Alistair Popple Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..fee4cf7fa300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -682,12 +682,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; -- cgit v1.2.3 From 9c47753167a6a585d0305663c6912f042e131c2d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:29 +0200 Subject: mm/vmalloc: defer freeing partly initialized vm_struct __vmalloc_area_node() may call free_vmap_area() or vfree() on error paths, both of which can sleep. This becomes problematic if the function is invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. To fix this, unify error paths and defer the cleanup of partly initialized vm_struct objects to a workqueue. This ensures that freeing happens in a process context and avoids invalid sleeps in atomic regions. Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 6 +++++- mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..1e43181369f1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. */ + }; + void *addr; unsigned long size; unsigned long flags; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d83c01caaabe..9e29dd767c41 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3687,6 +3687,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, return nr_allocated; } +static LLIST_HEAD(pending_vm_area_cleanup); +static void cleanup_vm_area_work(struct work_struct *work) +{ + struct vm_struct *area, *tmp; + struct llist_node *head; + + head = llist_del_all(&pending_vm_area_cleanup); + if (!head) + return; + + llist_for_each_entry_safe(area, tmp, head, llnode) { + if (!area->pages) + free_vm_area(area); + else + vfree(area->addr); + } +} + +/* + * Helper for __vmalloc_area_node() to defer cleanup + * of partially initialized vm_struct in error paths. + */ +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); +static void defer_vm_area_cleanup(struct vm_struct *area) +{ + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) + schedule_work(&cleanup_vm_area); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3718,8 +3747,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); - free_vm_area(area); - return NULL; + goto fail; } set_vm_area_page_order(area, page_shift - PAGE_SHIFT); @@ -3796,7 +3824,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - vfree(area->addr); + defer_vm_area_cleanup(area); return NULL; } -- cgit v1.2.3 From 8da89ba18ed4e9000d9b9b5b1f699e5004f4abf6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:30 +0200 Subject: mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node() Make __vmalloc_area_node() respect non-blocking GFP masks such as GFP_ATOMIC and GFP_NOWAIT. - Add memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers to apply a proper scope. - Apply memalloc_apply_gfp_scope()/memalloc_restore_scope() around vmap_pages_range() for page table setup. - Set "nofail" to false if a non-blocking mask is used, as they are mutually exclusive. This is particularly important for page table allocations that internally use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are applied. For example: __pte_alloc_kernel() pte_alloc_one_kernel(&init_mm); pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); Note: in most cases, PTE entries are established only up to the level required by current vmap space usage, meaning the page tables are typically fully populated during the mapping process. Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 52 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1e43181369f1..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -332,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9e29dd767c41..d8bcd87239b5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3716,6 +3716,42 @@ static void defer_vm_area_cleanup(struct vm_struct *area) schedule_work(&cleanup_vm_area); } +/* + * Page tables allocations ignore external GFP. Enforces it by + * the memalloc scope API. It is used by vmalloc internals and + * KASAN shadow population only. + * + * GFP to scope mapping: + * + * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() + * GFP_NOFS - memalloc_nofs_save() + * GFP_NOIO - memalloc_noio_save() + * + * Returns a flag cookie to pair with restore. + */ +unsigned int +memalloc_apply_gfp_scope(gfp_t gfp_mask) +{ + unsigned int flags = 0; + + if (!gfpflags_allow_blocking(gfp_mask)) + flags = memalloc_noreclaim_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + /* 0 - no scope applied. */ + return flags; +} + +void +memalloc_restore_scope(unsigned int flags) +{ + if (flags) + memalloc_flags_restore(flags); +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -3732,6 +3768,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, array_size = (unsigned long)nr_small_pages * sizeof(struct page *); + /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */ + if (!gfpflags_allow_blocking(gfp_mask)) + nofail = false; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -3797,22 +3837,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * page tables allocations ignore external gfp mask, enforce it * by the scope API */ - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - flags = memalloc_nofs_save(); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - flags = memalloc_noio_save(); - + flags = memalloc_apply_gfp_scope(gfp_mask); do { ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); - - if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) - memalloc_nofs_restore(flags); - else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) - memalloc_noio_restore(flags); + memalloc_restore_scope(flags); if (ret < 0) { warn_alloc(gfp_mask, NULL, -- cgit v1.2.3 From b186a94227b753f2fdcab0df29dfc636c63ac329 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:32 +0200 Subject: kmsan: remove hard-coded GFP_KERNEL flags kmsan_vmap_pages_range_noflush() allocates its temp s_pages/o_pages arrays with GFP_KERNEL, which may sleep. This is inconsistent with vmalloc() as it will support non-blocking requests later. Plumb gfp_mask through the kmsan_vmap_pages_range_noflush(), so it can use it internally for its demand. Please note, the subsequent __vmap_pages_range_noflush() still uses GFP_KERNEL and can sleep. If a caller runs under reclaim constraints, sleeping is forbidden, it must establish the appropriate memalloc scope API. Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 6 ++++-- mm/internal.h | 4 ++-- mm/kmsan/shadow.c | 6 +++--- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 26 +++++++++++++++++--------- 5 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1561fc2ff5b8..e623c8103358 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift); + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) @@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void) static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return -EINVAL; } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 55fdea199aaf..e7f554a31bb4 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -215,7 +215,7 @@ void kmsan_free_page(struct page *page, unsigned int order) int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift) + unsigned int page_shift, gfp_t gfp_mask) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; @@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, return 0; nr = (end - start) / PAGE_SIZE; - s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); - o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask); + o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask); if (!s_pages || !o_pages) { err = -ENOMEM; goto ret; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index cd69caf6aa8d..4f5937090590 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), - PAGE_KERNEL, pages, PAGE_SHIFT); + PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d8bcd87239b5..d7e7049e01f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -671,16 +671,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, } int vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) { int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, - page_shift); + page_shift, gfp_mask); if (ret) return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } +static int __vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift, + gfp_t gfp_mask) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask); + flush_cache_vmap(addr, end); + return err; +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map @@ -696,11 +708,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; + return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL); } static int check_sparse_vm_area(struct vm_struct *area, unsigned long start, @@ -3839,8 +3847,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, */ flags = memalloc_apply_gfp_scope(gfp_mask); do { - ret = vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift); + ret = __vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift, nested_gfp); if (nofail && (ret < 0)) schedule_timeout_uninterruptible(1); } while (nofail && (ret < 0)); -- cgit v1.2.3 From 7241bb2ea33d5ff50b77a5981342bcc826bef52a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:33 +0200 Subject: mm: skip might_alloc() warnings when PF_MEMALLOC is set might_alloc() catches invalid blocking allocations in contexts where sleeping is not allowed. However when PF_MEMALLOC is set, the page allocator already skips reclaim and other blocking paths. In such cases, a blocking gfp_mask does not actually lead to blocking, so triggering might_alloc() splats is misleading. Adjust might_alloc() to skip warnings when the current task has PF_MEMALLOC set, matching the allocator's actual blocking behaviour. Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..a74582aed747 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -318,6 +318,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } -- cgit v1.2.3 From 590c03ca6a3fbb114396673314e2aa483839608b Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:28:21 +0800 Subject: mm/ksm: fix exec/fork inheritance support for prctl Patch series "ksm: fix exec/fork inheritance", v2. This series fixes exec/fork inheritance. See the detailed description of the issue below. This patch (of 2): Background ========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") introduced MMF_VM_MERGE_ANY for mm->flags, and allowed user to set it by prctl() so that the process's VMAs are forcibly scanned by ksmd. Subsequently, the 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") supported inheriting the MMF_VM_MERGE_ANY flag when a task calls execve(). Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl") fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY by adding the mm_slot to ksm_mm_head in __bprm_mm_init(). Problem ======= In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY during exec/fork can fail. For example, when the scanning frequency of ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may still fail to pass it to the newly exec'd process. This happens because ksm_execve() is executed too early in the do_execve flow (prematurely adding the new mm_struct to the ksm_mm_slot list). As a result, before do_execve completes, ksmd may have already performed a scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus clearing its MMF_VM_MERGE_ANY flag. Consequently, when the new program executes, the flag MMF_VM_MERGE_ANY inheritance missed. Root reason =========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clear the flag MMF_VM_MERGE_ANY when ksmd found no VM_MERGEABLE VMAs. Solution ======== Firstly, Don't clear MMF_VM_MERGE_ANY when ksmd found no VM_MERGEABLE VMAs, because perhaps their mm_struct has just been added to ksm_mm_slot list, and its process has not yet officially started running or has not yet performed mmap/brk to allocate anonymous VMAS. Secondly, recheck MMF_VM_MERGEABLE again if a process takes MMF_VM_MERGE_ANY, and create a mm_slot and join it into ksm_scan_list again. Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Signed-off-by: xu xin Cc: Stefan Roesch Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Wang Yaxin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; diff --git a/mm/ksm.c b/mm/ksm.c index cdefba633856..4f672f4f2140 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2712,8 +2712,14 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); + /* + * Only clear MMF_VM_MERGEABLE. We must not clear + * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process, + * perhaps their mm_struct has just been added to ksm_mm_slot + * list, and its process has not yet officially started running + * or has not yet performed mmap/brk to allocate anonymous VMAS. + */ mm_flags_clear(MMF_VM_MERGEABLE, mm); - mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2831,12 +2837,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma) * * Returns: @vm_flags possibly updated to mark mergeable. */ -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && - __ksm_should_add_vma(file, vm_flags)) + __ksm_should_add_vma(file, vm_flags)) { vm_flags |= VM_MERGEABLE; + /* + * Generally, the flags here always include MMF_VM_MERGEABLE. + * However, in rare cases, this flag may be cleared by ksmd who + * scans a cycle without finding any mergeable vma. + */ + if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm))) + __ksm_enter(mm); + } return vm_flags; } -- cgit v1.2.3 From 9ac09bb9feaccc2f45e5606dc48a3f748d478dc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 Oct 2025 16:53:04 +0100 Subject: mm: consistently use current->mm in mm_get_unmapped_area() mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() / arch_get_unmapped_area_topdown(), both of which search current->mm for some free space. Neither take an mm_struct - they implicitly operate on current->mm. But the wrapper takes an mm_struct and uses it to decide whether to search bottom up or top down. All callers pass in current->mm for this, so everything is working consistently. But it feels like an accident waiting to happen; eventually someone will call that function with a different mm, expecting to find free space in it, but what gets returned is free space in the current mm. So let's simplify by removing the parameter and have the wrapper use current->mm to decide which end to start at. Now everything is consistent and self-documenting. Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/sparc/kernel/sys_sparc_64.c | 6 +++--- arch/x86/kernel/cpu/sgx/driver.c | 2 +- drivers/char/mem.c | 2 +- drivers/dax/device.c | 5 ++--- fs/hugetlbfs/inode.c | 3 +-- fs/proc/inode.c | 2 +- fs/ramfs/file-mmu.c | 2 +- include/linux/sched/mm.h | 9 ++++----- io_uring/memmap.c | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/huge_memory.c | 4 ++-- mm/mmap.c | 17 +++++++---------- mm/shmem.c | 8 +++----- 14 files changed, 29 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 55faf2effa46..dbf118b40601 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -241,7 +241,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -254,7 +254,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u align_goal = (64UL * 1024); do { - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, + addr = mm_get_unmapped_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); @@ -273,7 +273,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u * be obtained. */ if (addr & ~PAGE_MASK) - addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags); + addr = mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags); return addr; } diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 7f8d1e11dbee..3b3efadb8cae 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 34b815901b20..db1ca53a6d01 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -542,7 +542,7 @@ static unsigned long get_unmapped_area_zero(struct file *file, #ifdef CONFIG_TRANSPARENT_HUGEPAGE return thp_get_unmapped_area(file, addr, len, pgoff, flags); #else - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); #endif } #endif /* CONFIG_MMU */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 2bb40a6060af..7f1ed0db8337 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -330,14 +330,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp, if ((off + len_align) < off) goto out; - addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align, - pgoff, flags); + addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags); if (!IS_ERR_VALUE(addr_align)) { addr_align += (off - addr_align) & (align - 1); return addr_align; } out: - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } static const struct address_space_operations dev_dax_aops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f42548ee9083..ce8e40d35032 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -184,8 +184,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, - flags, 0); + return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d9b7ef122343..2d3425cfa94b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif return orig_addr; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b11f5b20b78b..c3ed1c5117b2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a74582aed747..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/io_uring/memmap.c b/io_uring/memmap.c index add03ca75cb9..63fcfa757bb8 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -387,7 +387,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, #else addr = 0UL; #endif - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 1074ac4459f2..872dc0e41c65 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -334,7 +334,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad return -EINVAL; } - ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); + ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..d77685f2c6cb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1162,7 +1162,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(filp, addr, len, pgoff, flags); #else return addr; #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2a521e5d68..32479ae27400 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1127,7 +1127,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, if (len_pad < len || (off + len_pad) < off) return 0; - ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad, + ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad, off >> PAGE_SHIFT, flags, vm_flags); /* @@ -1164,7 +1164,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add if (ret) return ret; - return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags, + return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); } diff --git a/mm/mmap.c b/mm/mmap.c index 5fd3b80fda1d..644f02071a41 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, } #endif -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, - vm_flags_t vm_flags) +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { - if (mm_flags_test(MMF_TOPDOWN, mm)) + if (mm_flags_test(MMF_TOPDOWN, current->mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { - addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, + addr = mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) @@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } unsigned long -mm_get_unmapped_area(struct mm_struct *mm, struct file *file, - unsigned long addr, unsigned long len, +mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area_vmflags(mm, file, addr, len, - pgoff, flags, 0); + return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); diff --git a/mm/shmem.c b/mm/shmem.c index 58701d14dd96..0eecb486a0cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2759,8 +2759,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (len > TASK_SIZE) return -ENOMEM; - addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, - flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2838,8 +2837,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (inflated_len < len) return addr; - inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, - inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) @@ -5775,7 +5773,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif -- cgit v1.2.3 From ada5cbe33a5321f8c896a3362c3aafa0bf262110 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Thu, 9 Oct 2025 20:54:03 +0500 Subject: kasan: cleanup of kasan_enabled() checks Deduplication of kasan_enabled() checks which are already used by callers. * Altered functions: check_page_allocation Delete the check because callers have it already in __wrappers in include/linux/kasan.h: __kasan_kfree_large __kasan_mempool_poison_pages __kasan_mempool_poison_object kasan_populate_vmalloc, kasan_release_vmalloc Add __wrappers in include/linux/kasan.h. They are called externally in mm/vmalloc.c. __kasan_unpoison_vmalloc, __kasan_poison_vmalloc Delete checks because there're already kasan_enabled() checks in respective __wrappers in include/linux/kasan.h. release_free_meta -- Delete the check because the higher caller path has it already. See the stack trace: __kasan_slab_free -- has the check already __kasan_mempool_poison_object -- has the check already poison_slab_object kasan_save_free_info release_free_meta kasan_enabled() -- Delete here Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Christophe Leroy Cc: Dmitriy Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 20 ++++++++++++++++++-- mm/kasan/common.c | 3 --- mm/kasan/generic.c | 3 --- mm/kasan/shadow.c | 20 ++++---------------- 4 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d4c14359feaf..22e5d67ff064 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_enabled()) - return false; - if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 516b49accc4f..2b8e73f5f6a7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_enabled()) - return; - /* Check if free meta is valid. */ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META) return; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index a30d84bfdd52..29a751a8a08d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) +static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; @@ -395,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_ return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; - if (!kasan_enabled()) - return 0; - if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -424,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); + ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask); if (ret) return ret; @@ -566,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * pages entirely covered by the free region, we will not run in to any * trouble - any simultaneous allocations will be for disjoint regions. */ -void kasan_release_vmalloc(unsigned long start, unsigned long end, +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags) @@ -575,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_enabled()) - return; - region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -626,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ - if (!kasan_enabled()) - return (void *)start; - if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -651,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_enabled()) - return; - if (!is_vmalloc_or_module_addr(start)) return; -- cgit v1.2.3 From eb8762dc220c0b0573100a941bfc68df34ece74f Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:16 +0530 Subject: drivers/base/node: fold register_node() into register_one_node() Patch series "drivers/base/node: fold node register and unregister functions", v2. The first patch merges register_one_node() and register_node(), leaving a single register_node() function. The second patch merges unregister_one_node() and unregister_node(), leaving a single unregister_node() function. There are no functional changes in these patches. This patch (of 2): register_node() is only called from register_one_node(). This patch folds register_node() into its only caller and renames register_one_node() to register_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. [akpm@linux-foundation.org: fix kerneldoc, per David] Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- arch/x86/mm/numa.c | 4 +-- drivers/base/node.c | 52 ++++++++++++------------------ include/linux/node.h | 4 +-- mm/memory_hotplug.c | 4 +-- mm/mm_init.c | 2 +- 6 files changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index aeb8633a3d00..8c77ec7980de 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) nid = of_node_to_nid(dn); if (likely((nid) >= 0)) { if (!node_online(nid)) { - if (register_one_node(nid)) { + if (register_node(nid)) { pr_err("PCI: Failed to register node %d\n", nid); } else { update_numa_distance(dn); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c24890c40138..7a97327140df 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -262,7 +262,7 @@ void __init init_gi_nodes(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ @@ -303,7 +303,7 @@ void __init init_cpu_to_node(void) * bringup_nonboot_cpus * cpu_up * __try_online_node - * register_one_node + * register_node * because node_subsys is not initialized yet. * TODO remove dependency on node_online */ diff --git a/drivers/base/node.c b/drivers/base/node.c index 83aeb0518e1d..17d7b90403ff 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,33 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/* - * register_node - Setup a sysfs device for a node. - * @num - Node number to use when creating the device. - * - * Initialize and register the node device. - */ -static int register_node(struct node *node, int num) -{ - int error; - - node->dev.id = num; - node->dev.bus = &node_subsys; - node->dev.release = node_device_release; - node->dev.groups = node_dev_groups; - error = device_register(&node->dev); - - if (error) { - put_device(&node->dev); - } else { - hugetlb_register_node(node); - compaction_register_node(node); - reclaim_register_node(node); - } - - return error; -} - /** * unregister_node - unregister a node device * @node: node going away @@ -907,7 +880,13 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn, } #endif /* CONFIG_MEMORY_HOTPLUG */ -int register_one_node(int nid) +/** + * register_node - Initialize and register the node device. + * @nid: Node number to use when creating the device. + * + * Return: 0 on success, -errno otherwise + */ +int register_node(int nid) { int error; int cpu; @@ -918,14 +897,23 @@ int register_one_node(int nid) return -ENOMEM; INIT_LIST_HEAD(&node->access_list); - node_devices[nid] = node; - error = register_node(node_devices[nid], nid); + node->dev.id = nid; + node->dev.bus = &node_subsys; + node->dev.release = node_device_release; + node->dev.groups = node_dev_groups; + + error = device_register(&node->dev); if (error) { - node_devices[nid] = NULL; + put_device(&node->dev); return error; } + node_devices[nid] = node; + hugetlb_register_node(node); + compaction_register_node(node); + reclaim_register_node(node); + /* link cpu under this node */ for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == nid) @@ -1018,7 +1006,7 @@ void __init node_dev_init(void) * to already created cpu devices. */ for_each_online_node(i) { - ret = register_one_node(i); + ret = register_node(i); if (ret) panic("%s() failed to add node: %d\n", __func__, ret); } diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..b7028d3ec3b4 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -176,7 +176,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); +int register_node(int nid); extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -189,7 +189,7 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0be83039c3b5..6c050d867031 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1311,7 +1311,7 @@ static int __try_online_node(int nid, bool set_node_online) if (set_node_online) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); BUG_ON(ret); } out: @@ -1542,7 +1542,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error_memblock_remove; if (ret) { node_set_online(nid); - ret = register_one_node(nid); + ret = register_node(nid); if (WARN_ON(ret)) { node_set_offline(nid); goto error_memblock_remove; diff --git a/mm/mm_init.c b/mm/mm_init.c index 7712d887b696..c6812b4dbb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) free_area_init_node(nid); /* - * No sysfs hierarchy will be created via register_one_node() + * No sysfs hierarchy will be created via register_node() *for memory-less node because here it's not marked as N_MEMORY *and won't be set online later. The benefit is userspace *program won't be confused by sysfs files/directories of -- cgit v1.2.3 From d945667dcb1996ddf00ffa8408b579e4ce573652 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:17 +0530 Subject: drivers/base/node: fold unregister_node() into unregister_one_node() unregister_node() is only called from unregister_one_node(). This patch folds unregister_node() into its only caller and renames unregister_one_node() to unregister_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. [donettom@linux.ibm.com: remove extra spaces before @nid and "All"] Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- drivers/base/node.c | 38 +++++++++++++++++--------------------- include/linux/node.h | 6 ++---- mm/memory_hotplug.c | 4 ++-- 3 files changed, 21 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 17d7b90403ff..00cf4532f121 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -676,23 +676,6 @@ static void node_device_release(struct device *dev) kfree(to_node(dev)); } -/** - * unregister_node - unregister a node device - * @node: node going away - * - * Unregisters a node device @node. All the devices on the node must be - * unregistered before calling this function. - */ -void unregister_node(struct node *node) -{ - hugetlb_unregister_node(node); - compaction_unregister_node(node); - reclaim_unregister_node(node); - node_remove_accesses(node); - node_remove_caches(node); - device_unregister(&node->dev); -} - struct node *node_devices[MAX_NUMNODES]; /* @@ -924,13 +907,26 @@ int register_node(int nid) return error; } - -void unregister_one_node(int nid) +/** + * unregister_node - unregister a node device + * @nid: nid of the node going away + * + * Unregisters the node device at node id @nid. All the devices on the + * node must be unregistered before calling this function. + */ +void unregister_node(int nid) { - if (!node_devices[nid]) + struct node *node = node_devices[nid]; + + if (!node) return; - unregister_node(node_devices[nid]); + hugetlb_unregister_node(node); + compaction_unregister_node(node); + reclaim_unregister_node(node); + node_remove_accesses(node); + node_remove_caches(node); + device_unregister(&node->dev); node_devices[nid] = NULL; } diff --git a/include/linux/node.h b/include/linux/node.h index b7028d3ec3b4..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -177,7 +175,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ int register_node(int nid); -extern void unregister_one_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -193,7 +191,7 @@ static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c050d867031..94a8f6e8811a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1596,7 +1596,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) error: if (new_node) { node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) @@ -2201,7 +2201,7 @@ void try_offline_node(int nid) * node now. */ node_set_offline(nid); - unregister_one_node(nid); + unregister_node(nid); } EXPORT_SYMBOL(try_offline_node); -- cgit v1.2.3 From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:08 -0700 Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5. Motivation & Approach ===================== While testing workloads with high sustained memory pressure on large machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly high number of softlockups. Further investigation showed that the zone lock in free_pcppages_bulk was being held for a long time, and was called to free 2k+ pages over 100 times just during boot. This causes starvation in other processes for the zone lock, which can lead to the system stalling as multiple threads cannot make progress without the locks. We can see these issues manifesting as warnings: [ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU [ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426 [ 4512.626401] rcu: hardirqs softirqs csw/system [ 4512.638793] rcu: number: 0 145 0 [ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms) [ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316) While these warnings don't indicate a crash or a kernel panic, they do point to the underlying issue of lock contention. To prevent starvation in both locks, batch the freeing of pages using pcp->batch. Because free_pcppages_bulk is called with the pcp lock and acquires the zone lock, relinquishing and reacquiring the locks are only effective when both of them are broken together (unless the system was built with queued spinlocks). Thus, instead of modifying free_pcppages_bulk to break both locks, batch the freeing from its callers instead. A similar fix has been implemented in the Meta fleet, and we have seen significantly less softlockups. Testing ======= The following are a few synthetic benchmarks, made on three machines. The first is a large machine with 754GiB memory and 316 processors. The second is a relatively smaller machine with 251GiB memory and 176 processors. The third and final is the smallest of the three, which has 62GiB memory and 36 processors. On all machines, I kick off a kernel build with -j$(nproc). Negative delta is better (faster compilation). Large machine (754GiB memory, 316 processors) make -j$(nproc) +------------+---------------+-----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+-----------+ | real | 0.8070 | - 1.4865 | | user | 0.2823 | + 0.4081 | | sys | 5.0267 | -11.8737 | +------------+---------------+-----------+ Medium machine (251GiB memory, 176 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.2806 | +0.0351 | | user | 0.0994 | +0.3170 | | sys | 0.6229 | -0.6277 | +------------+---------------+----------+ Small machine (62GiB memory, 36 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.1503 | -2.6585 | | user | 0.0431 | -2.2984 | | sys | 0.1870 | -3.2013 | +------------+---------------+----------+ Here, variation is the coefficient of variation, i.e. standard deviation / mean. Based on these results, it seems like there are varying degrees to how much lock contention this reduces. For the largest and smallest machines that I ran the tests on, it seems like there is quite some significant reduction. There is also some performance increases visible from userspace. Interestingly, the performance gains don't scale with the size of the machine, but rather there seems to be a dip in the gain there is for the medium-sized machine. One possible theory is that because the high watermark depends on both memory and the number of local CPUs, what impacts zone contention the most is not these individual values, but rather the ratio of mem:processors. This patch (of 5): Currently, refresh_cpu_vm_stats returns an int, indicating how many changes were made during its updates. Using this information, callers like vmstat_update can heuristically determine if more work will be done in the future. However, all of refresh_cpu_vm_stats's callers either (a) ignore the result, only caring about performing the updates, or (b) only care about whether changes were made, but not *how many* changes were made. Simplify the code by returning a bool instead to indicate if updates were made. In addition, simplify fold_diff and decay_pcp_high to return a bool for the same reason. Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: Vlastimil Babka Reviewed-by: SeongJae Park Cc: Brendan Jackman Cc: Chris Mason Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/page_alloc.c | 8 ++++---- mm/vmstat.c | 28 +++++++++++++++------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10a908793b4c..f057ce5ea7da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2557,10 +2557,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to decay the PCP high. * Return whether there are addition works to do. */ -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, batch; - int todo = 0; + bool todo = false; high_min = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); @@ -2573,7 +2573,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), pcp->high - (pcp->high >> 3), high_min); if (pcp->high > high_min) - todo++; + todo = true; } to_drain = pcp->count - pcp->high; @@ -2581,7 +2581,7 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); spin_unlock(&pcp->lock); - todo++; + todo = true; } return todo; diff --git a/mm/vmstat.c b/mm/vmstat.c index bb09c032eecf..98855f31294d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -771,25 +771,25 @@ EXPORT_SYMBOL(dec_node_page_state); /* * Fold a differential into the global counters. - * Returns the number of counters updated. + * Returns whether counters were updated. */ static int fold_diff(int *zone_diff, int *node_diff) { int i; - int changes = 0; + bool changed = false; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (zone_diff[i]) { atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; + changed = true; } for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + changed = true; } - return changes; + return changed; } /* @@ -806,16 +806,16 @@ static int fold_diff(int *zone_diff, int *node_diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. * - * The function returns the number of global counters updated. + * The function returns whether global counters were updated. */ -static int refresh_cpu_vm_stats(bool do_pagesets) +static bool refresh_cpu_vm_stats(bool do_pagesets) { struct pglist_data *pgdat; struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; - int changes = 0; + bool changed = false; for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; @@ -839,7 +839,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) if (do_pagesets) { cond_resched(); - changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); + if (decay_pcp_high(zone, this_cpu_ptr(pcp))) + changed = true; #ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this @@ -861,13 +862,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } if (__this_cpu_dec_return(pcp->expire)) { - changes++; + changed = true; continue; } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); - changes++; + changed = true; } #endif } @@ -887,8 +888,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } - changes += fold_diff(global_zone_diff, global_node_diff); - return changes; + if (fold_diff(global_zone_diff, global_node_diff)) + changed = true; + return changed; } /* -- cgit v1.2.3 From d929525c2e30abee621bf71f143ba6104c81ff2b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 16 Oct 2025 09:10:35 -0700 Subject: memcg: net: track network throttling due to memcg memory pressure The kernel can throttle network sockets if the memory cgroup associated with the corresponding socket is under memory pressure. The throttling actions include clamping the transmit window, failing to expand receive or send buffers, aggressively prune out-of-order receive queue, FIN deferred to a retransmitted packet and more. Let's add memcg metric to track such throttling actions. At the moment memcg memory pressure is defined through vmpressure and in future it may be defined using PSI or we may add more flexible way for the users to define memory pressure, maybe through ebpf. However the potential throttling actions will remain the same, so this newly introduced metric will continue to track throttling actions irrespective of how memcg memory pressure is defined. Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Daniel Sedlak Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Neal Cardwell Cc: Paolo Abeni Cc: Simon Horman Cc: Tejun Heo Cc: Willem de Bruijn Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ include/linux/memcontrol.h | 1 + include/net/sock.h | 6 +++++- kernel/cgroup/cgroup.c | 1 + mm/memcontrol.c | 3 +++ 5 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0e6c67ac585a..3345961c30ac 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1515,6 +1515,10 @@ The following nested keys are defined. oom_group_kill The number of times a group OOM has occurred. + sock_throttled + The number of times network sockets associated with + this cgroup are throttled. + memory.events.local Similar to memory.events but the fields in the file are local to the cgroup i.e. not hierarchical. The file modified event diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..5fe254813123 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; diff --git a/include/net/sock.h b/include/net/sock.h index 60bcb13f045c..ff7d49af1619 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2635,8 +2635,12 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) #endif /* CONFIG_MEMCG_V1 */ do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); return true; + } } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..8df671c59987 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4704,6 +4704,7 @@ void cgroup_file_notify(struct cgroup_file *cfile) } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_file_notify); /** * cgroup_file_show - show or hide a hidden cgroup file diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ae5cbcaed75..976412c8196e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); struct mem_cgroup *root_mem_cgroup __read_mostly; +EXPORT_SYMBOL(root_mem_cgroup); /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); @@ -4463,6 +4464,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) atomic_long_read(&events[MEMCG_OOM_KILL])); seq_printf(m, "oom_group_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); + seq_printf(m, "sock_throttled %lu\n", + atomic_long_read(&events[MEMCG_SOCK_THROTTLED])); } static int memory_events_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 2f05435df9320e70f7a98149eb4b043ff361a120 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 17 Oct 2025 15:53:07 +0800 Subject: mm: vmscan: simplify the logic for activating dirty file folios After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer attempt to write back filesystem folios through reclaim. However, in the shrink_folio_list() function, there still remains some logic related to writeback control of dirty file folios. The original logic was that, for direct reclaim, or when folio_test_reclaim() is false, or the PGDAT_DIRTY flag is not set, the dirty file folios would be directly activated to avoid being scanned again; otherwise, it will try to writeback the dirty file folios. However, since we can no longer perform writeback on dirty folios, the dirty file folios will still be activated. Additionally, under the original logic, if we continue to try writeback dirty file folios, we will also check the references flag, sc->may_writepage, and may_enter_fs(), which may result in dirty file folios being left in the inactive list. This is unreasonable. Even if these dirty folios are scanned again, we still cannot clean them. Therefore, the checks on these dirty file folios appear to be redundant and can be removed. Dirty file folios should be directly moved to the active list to avoid being scanned again. Since we set the PG_reclaim flag for the dirty folios, once the writeback is completed, they will be moved back to the tail of the inactive list to be retried for quick reclaim. Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ---- mm/vmscan.c | 25 +++---------------------- 2 files changed, 3 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e53ac12cc802..ecc90517b791 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1409,21 +1409,7 @@ retry: mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { - /* - * Only kswapd can writeback filesystem folios - * to avoid risk of stack overflow. But avoid - * injecting inefficient single-folio I/O into - * flusher writeback as much as possible: only - * write folios when we've encountered many - * dirty folios, and when we've already scanned - * the rest of the LRU for clean folios and see - * the same dirty folios again (with the reclaim - * flag set). - */ - if (folio_is_file_lru(folio) && - (!current_is_kswapd() || - !folio_test_reclaim(folio) || - !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + if (folio_is_file_lru(folio)) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() @@ -1432,7 +1418,8 @@ retry: */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); - folio_set_reclaim(folio); + if (!folio_test_reclaim(folio)) + folio_set_reclaim(folio); goto activate_locked; } @@ -6127,11 +6114,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty && - sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it @@ -6872,7 +6854,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat) clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); - clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit v1.2.3 From d3946c5f4c1c5db63532eb433a55c7d881de1389 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:53 -0700 Subject: mm/damon: document damos_quota_goal->nid use case Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage". Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup per-NUMA node memory utilization. Expected use cases are cgroup level access-aware NUMA memory managements, such as memory tiering or proactive reclamation on cgroup-based multi-tenant NUMA systems. Background ========== The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly recommended way for modern DAMOS use cases. Using it, users can specify what system status they want to achieve with what access-aware system operations. For example, reclaim cold memory aiming for 0.5 percent of memory pressure (proactive reclaim), or migrate hot and cold memory between NUMA nodes having different speed (memory tiering). Then DAMOS automatically adjusts the aggressiveness of the system operation (e.g., increase/decrease reclaim target coldness threshold) based on current status of the system. The use case is limited by the supported system status metrics for specifying the target system status. Two new system metrics for per-node memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were recently added to extend the use cases for access-aware NUMA nodes management, such as memory tiering. Those are expected to be useful for not only memory tiering but also general access-aware inter-NUMA node page migration, though. Limitation ---------- The per-node memory usage based auto-tuning can be applied only system-wide. For cgroups-based multi-tenant systems, it could arguably harm the fairness. For example, a cgroup may use faster NUMA node memory more than other cgroup, depending on their access pattern. If the user of each cgroup are promised to get the same quality and amount of the system resource, this can arguably be an unfair situation. DAMOS supports cgroup level system operations via DAMOS filter. But the quota auto-tuning system is not aware of cgroups. New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage =================================================================== To overcome the limitation, introduce two new DAMOS quota auto-tuning goal metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP. Those can be thought of as a variant of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that extended for cgroups. The two metrics specifies per-cgroup, per-node amount of used and unused memory in ratio to the total memory of the node. For example, let's assume a system has two NUMA nodes of size 100 GiB and 50 GiB. And two cgroups are using 40 GiB and 60 GiB of node 0, 20 GiB and 10 GiB of node 1, respectively, as illustrated by the below table. node-0 node-1 Total memory 100 GiB 50 GiB Cgroup A usage 40 GiB 20 GiB Cgroup B usage 60 GiB 10 GiB Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node are, 40 GiB / 100 GiB = 4,000 bp (40 percent) and 60 GiB / 100 GiB = 6,000 bp (60 percent), respectively. Those for the second node are, 20 GiB / 50 GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent), respectively. DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are, 60 GiB /100 GiB = 6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB / 50 GiB = 8000 bp, respectively. DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp Using these, users can specify how much [un]used amount of memory for per-cgroup and per-node DAMOS should make as a result of the auto-tuning. Example Usecase: Cgroup Level Memory Tiering ============================================ Let's suppose a typical and simple tiered memory system. The system equips two NUMA nodes. The first node (node 0) is CPU-attached and fast. The second node (node 1) is CPU-unattached and slow. It runs two cgroups that desire to use about 30 percent and 70 percent of the faster node as much as possible for their hot data, respectively. Then, the user can implement DAMOS-based memory tiering for the system using the DAMON user-space tool (damo), like below. # ./damo start \ `# kdamond for node 1 (slow)` \ --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \ `# promotion scheme for cgroup a` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \ \ `# promotion scheme for cgroup b` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/b \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 69.7% 0 workloads/b \ \ `# kdamond for node 0 (fast)` \ --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \ `# demotion scheme for cgroup a` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 70.5% 0 \ \ `# demotion scheme for cgroup b` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 30.5% 0 \ \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 1 1 1 1 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 With the command, the user-space tool will ask DAMON to spawn two kernel threads, each for monitoring accesses to node 1 (slow) and node 0 (fast), respectively. It installs two DAMOS schemes on each thread. Let's call them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup a/b" in the order. The promotion schemes are installed on the DAMON thread for node 1 (slow), and demotion schemes are installed on the DAMON thread for node 0 (fast). Cgroup Level Hot Pages Migration (Promotion) -------------------------------------------- Promotion schemes will find memory regions on node 1 (slow), that some access was detected. The schemes will then migrate the found memory to node 0 (fast), hottest pages first. For accurate and effective migration, these schemes use two page level filters. First, the migration will be filtered for only cgroup A and cgroup B. That is, "promotion scheme for cgroup B" will not do the migration if the page is for cgroup A. Secondly, the schemes will ignore pages that having their page table's Accessed bits unset. The per-page Accessed bit check logic will also unset the bit if it was set, for the next check. For controlled amounts of system resource consumption and aiming on the target memory usage, the schemes use quotas setup. The migration is limited to be done only up to 200 MiB per second, to limit the peak system resource usage. And DAMOS_QUOTA_NODE_MEMCG_USED_BP target is set for 29.7% and 69.7% of node 0 (fast), respectively. The target value is lower than the high level goal (30% and 70% system memory), to give headroom on node 0 (fast). DAMOS will adjust the speed of the pages migration based on the target and current per-cgroup node 0 memory usage. For example, if cgroup A is utilizing only 10% of node 0, DAMOS will try to migrate more of cgroup A hot pages from node 1 to node 0, up to 200 MiB per second. If cgroup A utilizes more than 29.7% of node 0 memory, the cgroup A hot pages migration from node 1 to node 0 will be slowed and eventually stopped. Cgroup Level Cold Pages Migration (Demotion) -------------------------------------------- Demotion schemes are similar to promotion schemes, but differ in filtering setup and quota tuning setup. Those filter out pages having their page table Accessed bits set. And set 70.5% and 30.5% of node 0 memory free rate for the cgroup A and B, respectively. Hence, if promotion schemes or something made cgroup A and/or B uses more than 29.5% and 69.5% of node 0, demotion schemes will start migrating cold pages of appropriate cgroups in node 0 to node 1, under the 200 MiB per second speed cap, while adjusting the speed based on how much more than wanted memory is being used. The quota target values are set to overlap with promotion targets, to keep a minimum level of page exchanges between the nodes. This is to avoid a case that the target memory utilization is met, and then access pattern changes (pages in node 1 become hotter than pages in node 0) while the memory utilization is unchanged. Without the overlap, neither promotion of hotter pages in node 1, nor demotion of colder pages in node 0 will happen since both goals are met. As a result, the faster and slower node will unexpectedly serve cold and hot data. Test: Per-cgroup Memory Tiering =============================== I ran a simplified cgroup level memory tiering using the feature, and confirmed it works as intended. Setup ----- I configured a QEMU virtual machine representing a simplified version of the system that described on the above cgroup level memory tiering example use case. The system equips 40 CPU cores and two NUMA nodes each having 30 GiB physical memory. The first node (node 0) represents the faster NUMA node, and the second node (node 1) represents the slower NUMA node. In specific, below qemu command line options are used. [...] -object memory-backend-ram,size=30G,id=m0 \ -object memory-backend-ram,size=30G,id=m1 \ -numa node,cpus=0-39,memdev=m0 \ -numa node,memdev=m1 \ [...] I booted the virtual machine with a kernel that this patch series is applied. On the virtual machine, I created two cgroups, namely workload_a and workload_b. And ran a test program in each cgroup, resulting in one process per cgroup. The test program allocates 10 GiB memory and evenly split it into 10 regions. After the allocation, it repeatedly access the first region for one minute, than the second one for one minute, and so on. After the one minute repeated access for the 10-th region is done, it repeats the access from the first region. So the process has 10 GiB of data in total, but only 1 GiB of it is hot at a given moment, and the hot data is gradually changed. While the processes are running, run DAMON for a simple access-aware memory tiering using below script. It migrates hot and cold data of the cgroups into node 0 and node 1, aiming the first and the second cgroups (workload_a and workload_b, respectively) utilizing about 9.7 percent and 19.7 percent of node 0, respectively. Note that this setup is a simplified version of the above example use case, for ease of test. Also note that we assigned 30 GiB physical memory to node 0, but DAMON in this setup works for only 27 GiB of the memory. It is due to an internal implementation detail of DAMON user-space tool that not really important for this test. #!/bin/bash damo start \ --numa_node 1 \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \ --numa_node 0 \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 After starting DAMON, the pages continuously be migrated across nodes. A few minutes later, the memory usage of the cgroups converges into the aimed amounts, and keeps the level, as expected. To confirm the status is kept in the target level as expected, I collected the memory usage stat of the cgroups using memory.numa_stat file, after the stats are converged. I repeat the stat collection 42 times with 5 seconds delay between each of the collections. The results are as below: node0_memory_usage average stdev workload_a 2.79GiB 522.06MiB workload_b 5.15GiB 739.10MiB The average values are quite close to the targeted values: 27 GiB * 9.7% = 2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB. A level of variances are expected, given the overlap of the promotion/demotion targets, and dynamic data access pattern of the workloads. Give that, the measured variances are at a reasonable level. Patches Sequence ================ The first patch (patch 1) updates the kernel-doc comment of damos_quota_goal struct to clarify usage of optional fields of the struct, since later patches will add such optional fields. Following four patches (patches 2-5) implement a new DAMOS quota goal metric for per-cgroup per-node memory usage. Those extends the core layer interface for the new metric (patch 2), implement the metric value calculation on the core layer (patch 3), add DAMON sysfs interface file for the target cgroup specification (patch 4), and implement support of the new metric on DAMON sysfs interface (patch 5). Next two patches implment the second new DAMOS quota goal metric for per-cgroup per-node free (or, unused) memory. Those implement it in the core layer (patch 6) and DAMON sysfs interface (patch 7), extending the existing implementation for memory usage metric. Final three patches update the design (patch 8), the usage (patch 9), and the ABI (patch 10) documents for the changes that are introduced by this patch series. This patch (of 10): damos_quota_goal kerneldoc comment is not explaining when @metric is used. Update the comment for that. Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..dc9c310e0e75 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -176,6 +176,9 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; -- cgit v1.2.3 From 6a18bbe48361acad1eae8d86aa47d353b1cfe619 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:54 -0700 Subject: mm/damon: add DAMOS quota goal type for per-memcg per-node memory usage Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node memory usage. For specifying the cgroup of the interest, add a field, namely memcg_id, to damos_quota_goal struct. Note that this commit is only implementing the interface. The handling of the interface (the metric value calculation) will be implemented in the following commit. Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index dc9c310e0e75..0d63ceb7e6ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -147,6 +147,7 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +157,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +168,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +182,9 @@ enum damos_quota_goal_metric { * * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. + * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents + * the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -187,7 +193,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; -- cgit v1.2.3 From 98fdce76fb7ed7070df21afbee46a4b36cb6a7c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:58 -0700 Subject: mm/damon/core: add DAMOS quota gaol metric for per-memcg per-numa free memory Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory portion. The value of the metric is implemented as the entire memory of the given NUMA node subtracted by the given cgroup's usage. So from a perspective, "unused" could be a better term than "free". But arguably it is not very clear what is better, so use the term "free". Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- mm/damon/core.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0d63ceb7e6ef..0edf41d36ea1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -148,6 +148,7 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -158,6 +159,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -183,8 +185,8 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. * - * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents - * the node id and the cgroup to account the used memory for. + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; diff --git a/mm/damon/core.c b/mm/damon/core.c index 8aa8d269df90..a9c11d2d37b0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -790,6 +790,7 @@ static void damos_commit_quota_goal_union( dst->nid = src->nid; break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: dst->nid = src->nid; dst->memcg_id = src->memcg_id; break; @@ -2046,7 +2047,7 @@ static unsigned long damos_get_node_memcg_used_bp( { struct mem_cgroup *memcg; struct lruvec *lruvec; - unsigned long used_pages; + unsigned long used_pages, numerator; struct sysinfo i; rcu_read_lock(); @@ -2066,7 +2067,11 @@ static unsigned long damos_get_node_memcg_used_bp( used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); si_meminfo_node(&i, goal->nid); - return used_pages * 10000 / i.totalram; + if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) + numerator = used_pages; + else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ + numerator = i.totalram - used_pages; + return numerator * 10000 / i.totalram; } #else static __kernel_ulong_t damos_get_node_mem_bp( @@ -2101,6 +2106,7 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) goal->current_value = damos_get_node_mem_bp(goal); break; case DAMOS_QUOTA_NODE_MEMCG_USED_BP: + case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; default: -- cgit v1.2.3 From e859a224fad65cb4848fe202aea9896a14fdb7f4 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Mon, 20 Oct 2025 21:01:24 +0800 Subject: mm/damon: add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() Patch series "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM", v2. In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. This patch (of 2): In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 6 ++++-- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- mm/damon/stat.c | 3 ++- 5 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0edf41d36ea1..9ee026c2db53 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -961,7 +961,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index a9c11d2d37b0..82546d138a5a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2818,6 +2818,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. + * @min_sz_region: Minimum region size. * * This function sets the region of @t as requested by @start and @end. If the * values of @start and @end are zero, however, this function finds the biggest @@ -2828,7 +2829,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * Return: 0 on success, negative error code otherwise. */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end) + unsigned long *start, unsigned long *end, + unsigned long min_sz_region) { struct damon_addr_range addr_range; @@ -2841,7 +2843,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); + return damon_set_regions(t, &addr_range, 1, min_sz_region); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 42b9a656f9de..49b4bc294f4e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + param_ctx->min_sz_region); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7ba3d0f9a19a..e30811cafe90 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, - &monitor_region_end); + &monitor_region_end, + DAMON_MIN_REGION); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index bf8626859902..ed8e3629d31a 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -188,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (!target) goto free_out; damon_add_target(ctx, target); - if (damon_set_region_biggest_system_ram_default(target, &start, &end)) + if (damon_set_region_biggest_system_ram_default(target, &start, &end, + ctx->min_sz_region)) goto free_out; return ctx; free_out: -- cgit v1.2.3 From 54c58a2f5fa191839cf192fa4ebab39395272a3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:21 +0100 Subject: mm: add vma_desc_size(), vma_desc_pages() helpers It's useful to be able to determine the size of a VMA descriptor range used on f_op->mmap_prepare, expressed both in bytes and pages, so add helpers for both and update code that could make use of it to do so. Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/ntfs3/file.c | 2 +- include/linux/mm.h | 10 ++++++++++ mm/secretmem.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4c90ec2fa2ea..2f344e1ed756 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -332,7 +332,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) if (rw) { u64 to = min_t(loff_t, i_size_read(inode), - from + desc->end - desc->start); + from + vma_desc_size(desc)); if (is_sparsed(ni)) { /* Allocate clusters for rw map. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..5752b0c516f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3583,6 +3583,16 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/mm/secretmem.c b/mm/secretmem.c index 9b0f5d9ec6f4..37f6d1097853 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file) static int secretmem_mmap_prepare(struct vm_area_desc *desc) { - const unsigned long len = desc->end - desc->start; + const unsigned long len = vma_desc_size(desc); if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; -- cgit v1.2.3 From 51e38e7d40d617965504f4dcba569ecf9302f245 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:24 +0100 Subject: mm: add remap_pfn_range_prepare(), remap_pfn_range_complete() We need the ability to split PFN remap between updating the VMA and performing the actual remap, in order to do away with the legacy f_op->mmap hook. To do so, update the PFN remap code to provide shared logic, and also make remap_pfn_range_notrack() static, as its one user, io_mapping_map_user() was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c"). Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor and PFN parameters, and remap_pfn_range_complete() which accepts the same parameters as remap_pfn_rangte(). remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so it must be supplied with a correct PFN to do so. While we're here, also clean up the duplicated #ifdef __HAVE_PFNMAP_TRACKING check and put into a single #ifdef/#else block. We keep these internal to mm as they should only be used by internal helpers. Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Acked-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++-- mm/internal.h | 4 ++ mm/memory.c | 132 +++++++++++++++++++++++++++++++++++------------------ 3 files changed, 110 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5752b0c516f2..ca5565f4fac4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. + */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -3634,10 +3649,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); diff --git a/mm/internal.h b/mm/internal.h index 56a9a714709a..5ca1e7842b19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1677,4 +1677,8 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index f13b20b702f6..8e02b8d75535 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2900,6 +2900,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -2912,31 +2931,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. - */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2957,7 +2952,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); @@ -3002,23 +2997,9 @@ void pfnmap_track_ctx_release(struct kref *ref) pfnmap_untrack(ctx->pfn, ctx->size); kfree(ctx); } -#endif /* __HAVE_PFNMAP_TRACKING */ -/** - * remap_pfn_range - remap kernel memory to userspace - * @vma: user vma to map to - * @addr: target page aligned user address to start at - * @pfn: page frame number of kernel physical memory address - * @size: size of mapping area - * @prot: page protection flags for this mapping - * - * Note: this is only safe if the mm semaphore is held when called. - * - * Return: %0 on success, negative error code otherwise. - */ -#ifdef __HAVE_PFNMAP_TRACKING -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct pfnmap_track_ctx *ctx = NULL; int err; @@ -3054,15 +3035,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, return err; } +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} #else -int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range_notrack(vma, addr, pfn, size, prot); } #endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. + */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. + */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); + if (err) + return err; + + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to -- cgit v1.2.3 From c707a68f9468e4ef4a3546b636a9dd088fe7b7f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:25 +0100 Subject: mm: abstract io_remap_pfn_range() based on PFN The only instances in which we customise this function are ones in which we customise the PFN used. Instances where architectures were not passing the pgprot value through pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so we can simply always pass pgprot through this function. Use this fact to simplify the use of io_remap_pfn_range(), by abstracting the PFN via io_remap_pfn_range_pfn() and using this instead of providing a general io_remap_pfn_range() function per-architecture. Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgtable.h | 3 --- arch/mips/alchemy/common/setup.c | 9 +++++---- arch/mips/include/asm/pgtable.h | 5 ++--- arch/sparc/include/asm/pgtable_32.h | 12 ++++-------- arch/sparc/include/asm/pgtable_64.h | 12 ++++-------- include/linux/mm.h | 19 ++++++++++++++----- 6 files changed, 29 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 5a394be09c35..d606afbabce1 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -263,7 +263,4 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) -#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) - #endif /* __ASM_CSKY_PGTABLE_H */ diff --git a/arch/mips/alchemy/common/setup.c b/arch/mips/alchemy/common/setup.c index a7a6d31a7a41..c35b4f809d51 100644 --- a/arch/mips/alchemy/common/setup.c +++ b/arch/mips/alchemy/common/setup.c @@ -94,12 +94,13 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size) return phys_addr; } -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size); - return remap_pfn_range(vma, vaddr, phys_addr >> PAGE_SHIFT, size, prot); + return phys_addr >> PAGE_SHIFT; } -EXPORT_SYMBOL(io_remap_pfn_range); +EXPORT_SYMBOL(io_remap_pfn_range_pfn); + #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index ae73ecf4c41a..9c06a612d33a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -604,9 +604,8 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, */ #ifdef CONFIG_MIPS_FIXUP_BIGPHYS_ADDR phys_addr_t fixup_bigphys_addr(phys_addr_t addr, phys_addr_t size); -int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr, - unsigned long pfn, unsigned long size, pgprot_t prot); -#define io_remap_pfn_range io_remap_pfn_range +unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size); +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #else #define fixup_bigphys_addr(addr, size) (addr) #endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index f1538a48484a..a9f802d1dd64 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -395,12 +395,8 @@ __get_iospace (unsigned long addr) #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - unsigned long, pgprot_t); - -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long long offset, space, phys_base; @@ -408,9 +404,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, space = GET_IOSPACE(pfn); phys_base = offset | (space << 32ULL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 64b85ff9c766..615f460c50af 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1048,9 +1048,6 @@ int page_in_phys_avail(unsigned long paddr); #define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4)) #define GET_PFN(pfn) (pfn & 0x0fffffffffffffffUL) -int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, - unsigned long, pgprot_t); - void adi_restore_tags(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte); @@ -1084,9 +1081,8 @@ static inline int arch_unmap_one(struct mm_struct *mm, return 0; } -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot) +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; int space = GET_IOSPACE(pfn); @@ -1094,9 +1090,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, phys_base = offset | (((unsigned long) space) << 32UL); - return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot); + return phys_base >> PAGE_SHIFT; } -#define io_remap_pfn_range io_remap_pfn_range +#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn static inline unsigned long __untagged_addr(unsigned long start) { diff --git a/include/linux/mm.h b/include/linux/mm.h index ca5565f4fac4..4441ceec913f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3684,15 +3684,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) -- cgit v1.2.3 From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:27 +0100 Subject: mm: add ability to take further action in vm_area_desc Some drivers/filesystems need to perform additional tasks after the VMA is set up. This is typically in the form of pre-population. The forms of pre-population most likely to be performed are a PFN remap or the insertion of normal folios and PFNs into a mixed map. We start by implementing the PFN remap functionality, ensuring that we perform the appropriate actions at the appropriate time - that is setting flags at the point of .mmap_prepare, and performing the actual remap at the point at which the VMA is fully established. This prevents the driver from doing anything too crazy with a VMA at any stage, and we retain complete control over how the mm functionality is applied. Unfortunately callers still do often require some kind of custom action, so we add an optional success/error _hook to allow the caller to do something after the action has succeeded or failed. This is done at the point when the VMA has already been established, so the harm that can be done is limited. The error hook can be used to filter errors if necessary. There may be cases in which the caller absolutely must hold the file rmap lock until the operation is entirely complete. It is an edge case, but certainly the hugetlbfs mmap hook requires it. To accommodate this, we add the hide_from_rmap_until_complete flag to the mmap_action type. In this case, if a new VMA is allocated, we will hold the file rmap lock until the operation is entirely completed (including any success/error hooks). Note that we do not need to update __compat_vma_mmap() to accommodate this flag, as this function will be invoked from an .mmap handler whose VMA is not yet visible, so we implicitly hide it from the rmap. If any error arises on these final actions, we simply unmap the VMA altogether. Also update the stacked filesystem compatibility layer to utilise the action behaviour, and update the VMA tests accordingly. While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap() as we are now performing actions invoked by the mmap_prepare in addition to just the mmap_prepare hook. Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +- include/linux/mm.h | 74 ++++++++++++++++++++ include/linux/mm_types.h | 53 ++++++++++++++ mm/util.c | 146 ++++++++++++++++++++++++++++++++++++--- mm/vma.c | 113 ++++++++++++++++++++++-------- tools/testing/vma/vma_internal.h | 98 ++++++++++++++++++++++++-- 6 files changed, 441 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..8cf9547a881c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2393,14 +2393,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4441ceec913f..2d060081caa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3608,6 +3608,80 @@ static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) return vma_desc_size(desc) >> PAGE_SHIFT; } +/** + * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..5021047485a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,6 +773,56 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -796,6 +846,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; /* diff --git a/mm/util.c b/mm/util.c index 8989d5767528..97cae40c0209 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio); #endif /** - * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * __compat_vma_mmap() - See description for compat_vma_mmap() * for details. This is the same operation, only with a specific file operations * struct which may or may not be the same as vma->vm_file->f_op. * @f_op: The file operations whose .mmap_prepare() hook is specified. @@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio); * @vma: The VMA to apply the .mmap_prepare() hook to. * Returns: 0 on success or error. */ -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { @@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -EXPORT_SYMBOL(__compat_vma_mmap_prepare); +EXPORT_SYMBOL(__compat_vma_mmap); /** - * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA. + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * @@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * .mmap_prepare() hook, as we are in a different context when we invoke the * .mmap() hook, already having a VMA to deal with. * - * compat_vma_mmap_prepare() is a compatibility function that takes VMA state, + * compat_vma_mmap() is a compatibility function that takes VMA state, * establishes a struct vm_area_desc descriptor, passes to the underlying * .mmap_prepare() hook and applies any changes performed by it. * @@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare); * * Returns: 0 on success or error. */ -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } -EXPORT_SYMBOL(compat_vma_mmap_prepare); +EXPORT_SYMBOL(compat_vma_mmap); static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, const struct page *page) @@ -1280,6 +1283,127 @@ again: } } +static int mmap_action_finish(struct mmap_action *action, + const struct vm_area_struct *vma, int err) +{ + /* + * If an error occurs, unmap the VMA altogether and return an error. We + * only clear the newly allocated VMA, since this function is only + * invoked if we do NOT merge, so we only clean up the VMA we created. + */ + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); + } + return err; + } + + if (action->success_hook) + return action->success_hook(vma); + + return 0; +} + +#ifdef CONFIG_MMU +/** + * mmap_action_prepare - Perform preparatory setup for an VMA descriptor + * action which need to be performed. + * @desc: The VMA descriptor to prepare for @action. + * @action: The action to perform. + */ +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + remap_pfn_range_prepare(desc, action->remap.start_pfn); + break; + case MMAP_IO_REMAP_PFN: + io_remap_pfn_range_prepare(desc, action->remap.start_pfn, + action->remap.size); + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +/** + * mmap_action_complete - Execute VMA descriptor action. + * @action: The action to perform. + * @vma: The VMA to perform the action upon. + * + * Similar to mmap_action_prepare(). + * + * Return: 0 on success, or error, at which point the VMA will be unmapped. + */ +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + err = remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + case MMAP_IO_REMAP_PFN: + err = io_remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#else +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle these. */ + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle this. */ + + err = -EINVAL; + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#endif + #ifdef CONFIG_MMU /** * folio_pte_batch - detect a PTE batch for a large folio diff --git a/mm/vma.c b/mm/vma.c index eb2f711c03a1..919d1fc63a52 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -34,7 +34,9 @@ struct mmap_state { struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ - bool check_ksm_early; + bool check_ksm_early :1; + /* If we map new, hold the file rmap lock on mapping. */ + bool hold_file_rmap_lock :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -1754,7 +1756,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) unlink_file_vma_batch_process(vb); } -static void vma_link_file(struct vm_area_struct *vma) +static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; @@ -1763,7 +1765,8 @@ static void vma_link_file(struct vm_area_struct *vma) mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); + if (!hold_rmap_lock) + i_mmap_unlock_write(mapping); } } @@ -1777,7 +1780,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) vma_start_write(vma); vma_iter_store_new(&vmi, vma); - vma_link_file(vma); + vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; @@ -2311,17 +2314,33 @@ static void update_ksm_flags(struct mmap_state *map) map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } +static void set_desc_from_map(struct vm_area_desc *desc, + const struct mmap_state *map) +{ + desc->start = map->addr; + desc->end = map->end; + + desc->pgoff = map->pgoff; + desc->vm_file = map->file; + desc->vm_flags = map->vm_flags; + desc->page_prot = map->page_prot; +} + /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * + * As a result it sets up the @map and @desc objects. + * * @map: Mapping state. + * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ -static int __mmap_setup(struct mmap_state *map, struct list_head *uf) +static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, + struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; @@ -2378,6 +2397,7 @@ static int __mmap_setup(struct mmap_state *map, struct list_head *uf) */ vms_clean_up_area(vms, &map->mas_detach); + set_desc_from_map(desc, map); return 0; } @@ -2479,7 +2499,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; - vma_link_file(vma); + vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below @@ -2539,6 +2559,17 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } +static void call_action_prepare(struct mmap_state *map, + struct vm_area_desc *desc) +{ + struct mmap_action *action = &desc->action; + + mmap_action_prepare(action, desc); + + if (action->hide_from_rmap_until_complete) + map->hold_file_rmap_lock = true; +} + /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. @@ -2550,34 +2581,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) * * Returns 0 on success, or an error code otherwise. */ -static int call_mmap_prepare(struct mmap_state *map) +static int call_mmap_prepare(struct mmap_state *map, + struct vm_area_desc *desc) { int err; - struct vm_area_desc desc = { - .mm = map->mm, - .file = map->file, - .start = map->addr, - .end = map->end, - - .pgoff = map->pgoff, - .vm_file = map->file, - .vm_flags = map->vm_flags, - .page_prot = map->page_prot, - }; /* Invoke the hook. */ - err = vfs_mmap_prepare(map->file, &desc); + err = vfs_mmap_prepare(map->file, desc); if (err) return err; + call_action_prepare(map, desc); + /* Update fields permitted to be changed. */ - map->pgoff = desc.pgoff; - map->file = desc.vm_file; - map->vm_flags = desc.vm_flags; - map->page_prot = desc.page_prot; + map->pgoff = desc->pgoff; + map->file = desc->vm_file; + map->vm_flags = desc->vm_flags; + map->page_prot = desc->page_prot; /* User-defined fields. */ - map->vm_ops = desc.vm_ops; - map->vm_private_data = desc.private_data; + map->vm_ops = desc->vm_ops; + map->vm_private_data = desc->private_data; return 0; } @@ -2619,22 +2642,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map) return false; } +static int call_action_complete(struct mmap_state *map, + struct vm_area_desc *desc, + struct vm_area_struct *vma) +{ + struct mmap_action *action = &desc->action; + int ret; + + ret = mmap_action_complete(action, vma); + + /* If we held the file rmap we need to release it. */ + if (map->hold_file_rmap_lock) { + struct file *file = vma->vm_file; + + i_mmap_unlock_write(file->f_mapping); + } + return ret; +} + static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); + struct vm_area_desc desc = { + .mm = mm, + .file = file, + .action = { + .type = MMAP_NOTHING, /* Default to no further action. */ + }, + }; + bool allocated_new = false; + int error; map.check_ksm_early = can_set_ksm_flags_early(&map); - error = __mmap_setup(&map, uf); + error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) - error = call_mmap_prepare(&map); + error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; @@ -2653,6 +2702,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; + allocated_new = true; } if (have_mmap_prepare) @@ -2660,6 +2710,13 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); + if (have_mmap_prepare && allocated_new) { + error = call_action_complete(&map, &desc, vma); + + if (error) + return error; + } + return addr; /* Accounting was done by __mmap_setup(). */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..d873667704e8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -275,6 +275,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -298,6 +349,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; struct file_operations { @@ -1326,12 +1380,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1404,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? */ @@ -1374,7 +1442,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1475,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ -- cgit v1.2.3 From ea52cb24cd3fb121283754ab82b2cb3044609359 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:29 +0100 Subject: mm/hugetlbfs: update hugetlbfs to use mmap_prepare Since we can now perform actions after the VMA is established via mmap_prepare, use desc->action_success_hook to set up the hugetlb lock once the VMA is setup. We also make changes throughout hugetlbfs to make this possible. Note that we must hide newly established hugetlb VMAs from the rmap until the operation is entirely complete as we establish a hugetlb lock during VMA setup that can be raced by rmap users. Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Tested-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 46 ++++++++++++++++++------- include/linux/hugetlb.h | 9 +++-- include/linux/hugetlb_inline.h | 15 +++++--- mm/hugetlb.c | 77 ++++++++++++++++++++++++------------------ 4 files changed, 95 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ce8e40d35032..3919fca56553 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { + /* Unfortunate we have to reassign vma->vm_private_data. */ + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); +} + +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); - vma->vm_ops = &hugetlb_vm_ops; + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vma_len = (loff_t)vma_desc_size(desc); + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; - vm_flags = vma->vm_flags; + vm_flags = desc->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags |= VM_NORESERVE; if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma, - vm_flags) < 0) + desc->pgoff >> huge_page_order(h), + len >> huge_page_shift(h), desc, + vm_flags) < 0) goto out; ret = 0; - if (vma->vm_flags & VM_WRITE && inode->i_size < len) + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); + if (!ret) { + /* Allocate the VMA lock after we set it up. */ + desc->action.success_hook = hugetlb_file_mmap_prepare_success; + /* + * We cannot permit the rmap finding this VMA in the time + * between the VMA being inserted into the VMA tree and the + * completion/success hook being invoked. + * + * This is because we establish a per-VMA hugetlb lock which can + * be raced by rmap. + */ + desc->action.hide_from_rmap_until_complete = true; + } return ret; } @@ -1220,7 +1240,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap = hugetlbfs_file_mmap, + .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..2387513d6ae5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7774c286b3b7..86e672fcb305 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -438,17 +437,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +/* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) - return; + return 0; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return; + return -EINVAL; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { @@ -463,13 +466,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return; + return -EINVAL; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; + + return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1201,20 +1206,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); - set_vma_private_data(vma, (unsigned long)map); + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) { - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); - set_vma_private_data(vma, get_vma_private_data(vma) | flags); + desc->private_data = map; +} + +static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); + + desc->private_data = (void *)((unsigned long)desc->private_data | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) @@ -1224,6 +1237,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } +static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) +{ + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); + + return ((unsigned long)desc->private_data) & flag; +} + bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -7270,9 +7290,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, */ long hugetlb_reserve_pages(struct inode *inode, - long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags) + long from, long to, + struct vm_area_desc *desc, + vm_flags_t vm_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; struct hstate *h = hstate_inode(inode); @@ -7287,12 +7307,6 @@ long hugetlb_reserve_pages(struct inode *inode, return -EINVAL; } - /* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ - hugetlb_vma_lock_alloc(vma); - /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -7305,9 +7319,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. Assume !vma is a shm mapping + * called to make the mapping read-write. Assume !desc is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -7324,8 +7338,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + set_vma_desc_resv_map(desc, resv_map); + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); } if (chg < 0) @@ -7335,7 +7349,7 @@ long hugetlb_reserve_pages(struct inode *inode, chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -7369,7 +7383,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) { + if (!desc || desc->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -7423,16 +7437,15 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - hugetlb_vma_lock_free(vma); - if (!vma || vma->vm_flags & VM_MAYSHARE) + if (!desc || desc->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_resv_map(vma, NULL); + set_vma_desc_resv_map(desc, NULL); } return chg < 0 ? chg : add < 0 ? add : -EINVAL; } -- cgit v1.2.3 From 89646d9c748c0902600090f37ae585f3b99deb4d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:30 +0100 Subject: mm: add shmem_zero_setup_desc() Add the ability to set up a shared anonymous mapping based on a VMA descriptor rather than a VMA. This is a prerequisite for converting to the char mm driver to use the mmap_prepare hook. Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 3 ++- mm/shmem.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..5b368f9549d6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); diff --git a/mm/shmem.c b/mm/shmem.c index 8b9fcdd144c8..da1df4270309 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5880,14 +5880,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -5895,7 +5890,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -5907,6 +5913,25 @@ int shmem_zero_setup(struct vm_area_struct *vma) return 0; } +/** + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space -- cgit v1.2.3 From 5ff592bec75ad79ed7f1a817477ab6eef8dc5efc Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 21 Oct 2025 16:44:25 -0700 Subject: memcg: manually uninline __memcg_memory_event __memcg_memory_event() has been unnecessarily marked inline even when it is not really performance critical. It is usually called to track extreme conditions. Over the time, it has evolved to include more functionality and inlining it is causing more harm. Before the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35645 10574 4192 50411 c4eb mm/memcontrol.o 54738 1658 0 56396 dc4c net/ipv4/tcp_input.o 34644 1065 0 35709 8b7d net/ipv4/tcp_output.o After the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35137 10446 4192 49775 c26f mm/memcontrol.o 54322 1562 0 55884 da4c net/ipv4/tcp_input.o 34492 1017 0 35509 8ab5 net/ipv4/tcp_output.o [akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph] Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: SeongJae Park Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 32 ++------------------------------ mm/memcontrol.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe254813123..8c0f15e5978f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,36 +1002,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX can happen with !allow_spinning context. */ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 976412c8196e..025da46d9959 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1626,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg) return page_counter_read(&memcg->memory); } +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning) +{ + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + + /* For now only MEMCG_MAX can happen with !allow_spinning context. */ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + + atomic_long_inc(&memcg->memory_events_local[event]); + if (!swap_event && allow_spinning) + cgroup_file_notify(&memcg->events_local_file); + + do { + atomic_long_inc(&memcg->memory_events[event]); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + break; + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + break; + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); +} +EXPORT_SYMBOL_GPL(__memcg_memory_event); + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { -- cgit v1.2.3 From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:28 +0800 Subject: mm: add a ptdesc flag to mark kernel page tables The page tables used to map the kernel and userspace often have very different handling rules. There are frequently *_kernel() variants of functions just for kernel page tables. That's not great and has lead to code duplication. Instead of having completely separate call paths, allow a 'ptdesc' to be marked as being for kernel mappings. Introduce helpers to set and clear this status. Note: this uses the PG_referenced bit. Page flags are a great fit for this since it is truly a single bit of information. Use PG_referenced itself because it's a fairly benign flag (as opposed to things like PG_lock). It's also (according to Willy) unlikely to go away any time soon. PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be cleared before freeing the page, and pages coming out of the allocator should have it cleared. Regardless, introduce an API to clear it anyway. Having symmetry in the API makes it easier to change the underlying implementation later, like if there was a need to move to a PAGE_FLAGS_CHECK_AT_FREE bit. Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d060081caa5..5c887c4ea29e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2962,6 +2962,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU */ enum pt_flags { + PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; @@ -2987,6 +2988,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) return test_bit(PT_reserved, &pt->pt_flags.f); } +/** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. + */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags -- cgit v1.2.3 From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:29 +0800 Subject: mm: actually mark kernel page table pages Now that the API is in place, mark kernel page table pages just after they are allocated. Unmark them just before they are freed. Note: Unconditionally clearing the 'kernel' marking (via ptdesc_clear_kernel()) would be functionally identical to what is here. But having the if() makes it logically clear that this function can be used for kernel and non-kernel page tables. Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 18 ++++++++++++++++++ include/linux/mm.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 3c8ec3bfea44..b9d2a7c79b93 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) return NULL; } + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad pagetable_free(ptdesc); return NULL; } + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) @@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_pud_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) @@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_p4d_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order return NULL; pagetable_pgd_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c887c4ea29e..8f46048875a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,6 +3057,9 @@ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); + if (ptdesc_test_kernel(pt)) + ptdesc_clear_kernel(pt); + __free_pages(page, compound_order(page)); } -- cgit v1.2.3 From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:31 +0800 Subject: mm: introduce pure page table freeing function The pages used for ptdescs are currently freed back to the allocator in a single location. They will shortly be freed from a second location. Create a simple helper that just frees them back to the allocator. Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f46048875a7..88c0a0fae43a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3046,6 +3046,13 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3055,12 +3062,10 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - if (ptdesc_test_kernel(pt)) ptdesc_clear_kernel(pt); - __free_pages(page, compound_order(page)); + __pagetable_free(pt); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) -- cgit v1.2.3 From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:33 +0800 Subject: mm: introduce deferred freeing for kernel page tables This introduces a conditional asynchronous mechanism, enabled by CONFIG_ASYNC_KERNEL_PGTABLE_FREE. When enabled, this mechanism defers the freeing of pages that are used as page tables for kernel address mappings. These pages are now queued to a work struct instead of being freed immediately. This deferred freeing allows for batch-freeing of page tables, providing a safe context for performing a single expensive operation (TLB flush) for a batch of kernel page tables instead of performing that expensive operation for each page table. Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++--- mm/Kconfig | 3 +++ mm/pgtable-generic.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 88c0a0fae43a..a6fd9f5aaf30 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt) */ static inline void pagetable_free(struct ptdesc *pt) { - if (ptdesc_test_kernel(pt)) + if (ptdesc_test_kernel(pt)) { ptdesc_clear_kernel(pt); - - __pagetable_free(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) diff --git a/mm/Kconfig b/mm/Kconfig index 4971436c8697..682a5c39a1a6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -906,6 +906,9 @@ config HAVE_GIGANTIC_FOLIOS def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +config ASYNC_KERNEL_PGTABLE_FREE + def_bool n + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 567e2d084071..1c7caa8ef164 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -406,3 +406,40 @@ again: pte_unmap_unlock(pte, ptl); goto again; } + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +static void kernel_pgtable_work_func(struct work_struct *work); + +static struct { + struct list_head list; + /* protect above ptdesc lists */ + spinlock_t lock; + struct work_struct work; +} kernel_pgtable_work = { + .list = LIST_HEAD_INIT(kernel_pgtable_work.list), + .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), + .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), +}; + +static void kernel_pgtable_work_func(struct work_struct *work) +{ + struct ptdesc *pt, *next; + LIST_HEAD(page_list); + + spin_lock(&kernel_pgtable_work.lock); + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); +} + +void pagetable_free_kernel(struct ptdesc *pt) +{ + spin_lock(&kernel_pgtable_work.lock); + list_add(&pt->pt_list, &kernel_pgtable_work.list); + spin_unlock(&kernel_pgtable_work.lock); + + schedule_work(&kernel_pgtable_work.work); +} +#endif -- cgit v1.2.3 From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 22 Oct 2025 16:26:34 +0800 Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space Introduce a new IOMMU interface to flush IOTLB paging cache entries for the CPU kernel address space. This interface is invoked from the x86 architecture code that manages combined user and kernel page tables, specifically before any kernel page table page is freed and reused. This addresses the main issue with vfree() which is a common occurrence and can be triggered by unprivileged users. While this resolves the primary problem, it doesn't address some extremely rare case related to memory unplug of memory that was present as reserved memory at boot, which cannot be triggered by unprivileged users. The discussion can be found at the link below. Enable SVA on x86 architecture since the IOMMU can now receive notification to flush the paging cache before freeing the CPU kernel page table pages. Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/ Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Suggested-by: Jann Horn Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + drivers/iommu/iommu-sva.c | 32 ++++++++++++++++++++++++++++---- include/linux/iommu.h | 4 ++++ mm/pgtable-generic.c | 2 ++ 4 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..a3700766a8c0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -279,6 +279,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index a0442faad952..d236aef80a8d 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,8 @@ #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); +static bool iommu_sva_present; +static LIST_HEAD(iommu_sva_mms); static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); @@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de return ERR_PTR(-ENOSPC); } iommu_mm->pasid = pasid; + iommu_mm->mm = mm; INIT_LIST_HEAD(&iommu_mm->sva_domains); /* * Make sure the write to mm->iommu_mm is not reordered in front of @@ -77,9 +80,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (!group) return ERR_PTR(-ENODEV); - if (IS_ENABLED(CONFIG_X86)) - return ERR_PTR(-EOPNOTSUPP); - mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. */ @@ -135,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1; - list_add(&domain->next, &mm->iommu_mm->sva_domains); + if (list_empty(&iommu_mm->sva_domains)) { + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = true; + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); + } + list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); @@ -178,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); } + + if (list_empty(&iommu_mm->sva_domains)) { + list_del(&iommu_mm->mm_list_elm); + if (list_empty(&iommu_sva_mms)) + iommu_sva_present = false; + } + mutex_unlock(&iommu_sva_lock); kfree(handle); } @@ -315,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, return domain; } + +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{ + struct iommu_mm_data *iommu_mm; + + guard(mutex)(&iommu_sva_lock); + if (!iommu_sva_present) + return; + + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); +} diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..66e4abb2df0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1134,7 +1134,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1c7caa8ef164..8c22be79b734 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -430,6 +431,7 @@ static void kernel_pgtable_work_func(struct work_struct *work) list_splice_tail_init(&kernel_pgtable_work.list, &page_list); spin_unlock(&kernel_pgtable_work.lock); + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); list_for_each_entry_safe(pt, next, &page_list, pt_list) __pagetable_free(pt); } -- cgit v1.2.3 From a983471cfc454afeba23526ee5d17fd8cdc7876f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:41 +0800 Subject: mm, swap: cleanup swap entry allocation parameter We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap: use the swap table for the swap cache and switch API"). Before that commit the GFP parameter is already almost identical for all callers, so nothing changed by that commit. Swap table just moved the GFP to lower layer and make it more defined and changes depend on atomic or sleep allocation. Now this parameter is no longer used, just remove it. No behavior change. Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swapfile.c | 3 +-- mm/vmscan.c | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..a4b264817735 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -462,7 +462,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +560,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } diff --git a/mm/shmem.c b/mm/shmem.c index da1df4270309..e1dc2d8e939c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1617,7 +1617,7 @@ try_split: folio_mark_uptodate(folio); } - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; diff --git a/mm/swapfile.c b/mm/swapfile.c index 808052319c0b..d87b562ae661 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1417,7 +1417,6 @@ static bool swap_sync_discard(void) /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap - * @gfp: gfp mask for shadow nodes * * Allocate swap space for the folio and add the folio to the * swap cache. @@ -1425,7 +1424,7 @@ static bool swap_sync_discard(void) * Context: Caller needs to hold the folio lock. * Return: Whether the folio was added to the swap cache. */ -int folio_alloc_swap(struct folio *folio, gfp_t gfp) +int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; diff --git a/mm/vmscan.c b/mm/vmscan.c index ecc90517b791..c23c9616052a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1318,7 +1318,7 @@ retry: split_folio_to_list(folio, folio_list)) goto activate_locked; } - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) { + if (folio_alloc_swap(folio)) { int __maybe_unused order = folio_order(folio); if (!folio_test_large(folio)) @@ -1334,7 +1334,7 @@ retry: } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) + if (folio_alloc_swap(folio)) goto activate_locked_split; } /* -- cgit v1.2.3 From adf7d6cdd716e1f3826789befc453c961dfafcf2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:25 -0700 Subject: mm/damon/core: add damon_target->obsolete for pin-point removal Patch series "mm/damon: support pin-point targets removal". DAMON maintains the targets in a list, and allows committing only an entire list of targets having the new parameters. Targets having same index on the lists are treated as matching source and destination targets. If an existing target cannot find a matching one in the sources list, the target is removed. This means that there is no way to remove only a specific monitoring target in the middle of the current targets list. Such pin-point target removal is really needed in some use cases, though. Monitoring access patterns on virtual address spaces of processes that spawned from the same ancestor is one example. If a process of the group is terminated, the user may want to remove the matching DAMON target as soon as possible, to save in-kernel memory usage for the unnecessary target data. The user may also want to do that without turning DAMON off or removing unnecessary targets, to keep the current monitoring results for other active processes. Extend DAMON kernel API and sysfs ABI to support the pin-point removal in the following way. For API, add a new damon_target field, namely 'obsolete'. If the field on parameters commit source target is set, it means the matching destination target is obsolete. Then the parameters commit logic removes the destination target from the existing targets list. For sysfs ABI, add a new file under the target directory, namely 'obsolete_target'. It is connected with the 'obsolete' field of the commit source targets, so internally using the new API. Also add a selftest for the new feature. The related helper scripts for manipulating the sysfs interface and dumping in-kernel DAMON status are also extended for this. Note that the selftest part was initially posted as an individual RFC series [1], but now merged into this one. Bijan Tabatabai has originally reported this issue, and participated in this solution design on a GitHub issue [1] for DAMON user-space tool. This patch (of 9): DAMON's monitoring targets parameters update function, damon_commit_targets(), is not providing a way to remove a target in the middle of the existing targets list. Extend the API by adding a field to struct damon_target. If the field of a damon_commit_targets() source target is set, it indicates the matching target on the existing targets list is obsolete. damon_commit_targets() understands that and removes those from the list, while respecting the index based matching for other non-obsolete targets. Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org Link: https://github.com/damonitor/damo/issues/36 [1] Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 9ee026c2db53..f3566b978cdf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. + * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index 769da97fcb26..06ad359024ad 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -479,6 +479,7 @@ struct damon_target *damon_new_target(void) t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); INIT_LIST_HEAD(&t->list); + t->obsolete = false; return t; } @@ -1187,7 +1188,11 @@ static int damon_commit_targets( damon_for_each_target_safe(dst_target, next, dst) { src_target = damon_nth_target(i++, src); - if (src_target) { + /* + * If src target is obsolete, do not commit the parameters to + * the dst target, and further remove the dst target. + */ + if (src_target && !src_target->obsolete) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), @@ -1210,6 +1215,9 @@ static int damon_commit_targets( damon_for_each_target_safe(src_target, next, src) { if (j++ < i) continue; + /* target to remove has no matching dst */ + if (src_target->obsolete) + return -EINVAL; new_target = damon_new_target(); if (!new_target) return -ENOMEM; -- cgit v1.2.3 From b734b9d973ccd7ad1cfebc2e1f7db693824a37ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 24 Oct 2025 10:09:02 +0100 Subject: mm/vma: small VMA lock cleanups We declare vma_start_read() as a static function in mm/mmap_lock.c, so there is no need to provide a stub for !CONFIG_PER_VMA_LOCK. __is_vma_write_locked() is declared in a header and should therefore be static inline. Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to make precedence clear. Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..e05da70dc0cb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -281,9 +281,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 272239dc8fcb109b9f1ec1a73bb85405dac92eda Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 Oct 2025 03:56:38 +0100 Subject: mm: make INVALID_PHYS_ADDR a generic macro INVALID_PHYS_ADDR has very similar definitions across the code base. Hence just move that inside header for more generic usage. Also drop the now redundant ones which are no longer required. Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev [s390] Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 -- arch/s390/boot/vmem.c | 1 - drivers/vdpa/vdpa_user/iova_domain.h | 2 -- include/linux/mm.h | 2 ++ kernel/dma/swiotlb.c | 2 -- 5 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..94e29e3574ff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -470,8 +470,6 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, mutex_unlock(&fixmap_lock); } -#define INVALID_PHYS_ADDR (-1ULL) - static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, enum pgtable_type pgtable_type) { diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index cea3de4dce8c..fbe64ffdfb96 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -16,7 +16,6 @@ #include "decompressor.h" #include "boot.h" -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) struct ctlreg __bootdata_preserved(s390_invalid_asce); #ifdef CONFIG_PROC_FS diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 775cad5238f3..a923971a64f5 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -17,8 +17,6 @@ #define IOVA_START_PFN 1 -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - #define BOUNCE_MAP_SHIFT 12 #define BOUNCE_MAP_SIZE (1 << BOUNCE_MAP_SHIFT) #define BOUNCE_MAP_MASK (~(BOUNCE_MAP_SIZE - 1)) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6fd9f5aaf30..7bcd9e6fbc3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0d37da3d95b6..a547c7693135 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -61,8 +61,6 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) - /** * struct io_tlb_slot - IO TLB slot descriptor * @orig_addr: The original address corresponding to a mapped entry. -- cgit v1.2.3 From 8e689f8ea45ffdae20350246dd37d124d7092c92 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Oct 2025 11:43:07 +0800 Subject: mm/swap: do not choose swap device according to numa node Patch series "mm/swapfile.c: select swap devices of default priority round robin", v5. Currently, on system with multiple swap devices, swap allocation will select one swap device according to priority. The swap device with the highest priority will be chosen to allocate firstly. People can specify a priority from 0 to 32767 when swapon a swap device, or the system will set it from -2 then downwards by default. Meanwhile, on NUMA system, the swap device with node_id will be considered first on that NUMA node of the node_id. In the current code, an array of plist, swap_avail_heads[nid], is used to organize swap devices on each NUMA node. For each NUMA node, there is a plist organizing all swap devices. The 'prio' value in the plist is the negated value of the device's priority due to plist being sorted from low to high. The swap device owning one node_id will be promoted to the front position on that NUMA node, then other swap devices are put in order of their default priority. E.g I got a system with 8 NUMA nodes, and I setup 4 zram partition as swap devices. Current behaviour: their priorities will be(note that -1 is skipped): NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 0B -2 /dev/zram1 partition 16G 0B -3 /dev/zram2 partition 16G 0B -4 /dev/zram3 partition 16G 0B -5 And their positions in the 8 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:3 prio:4 prio:5 swap_avali_lists[1]: /* node 1's available swap device list */ zram1 -> zram0 -> zram2 -> zram3 prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ zram2 -> zram0 -> zram1 -> zram3 prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ zram3 -> zram0 -> zram1 -> zram2 prio:1 prio:2 prio:3 prio:4 swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:2 prio:3 prio:4 prio:5 The adjustment for swap device with node_id intended to decrease the pressure of lock contention for one swap device by taking different swap device on different node. The adjustment was introduced in commit a2468cc9bfdf ("swap: choose swap device according to numa node"). However, the adjustment is a little coarse-grained. On the node, the swap device sharing the node's id will always be selected firstly by node's CPUs until exhausted, then next one. And on other nodes where no swap device shares its node id, swap device with priority '-2' will be selected firstly until exhausted, then next with priority '-3'. This is the swapon output during the process high pressure vm-scability test is being taken. It's clearly showing zram0 is heavily exploited until exhausted. =================================== [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 15.7G -2 /dev/zram1 partition 16G 3.4G -3 /dev/zram2 partition 16G 3.4G -4 /dev/zram3 partition 16G 2.6G -5 The node based strategy on selecting swap device is much better then the old way one by one selecting swap device. However it is still unreasonable because swap devices are assumed to have similar accessing speed if no priority is specified when swapon. It's unfair and doesn't make sense just because one swap device is swapped on firstly, its priority will be higher than the one swapped on later. So in this patchset, change is made to select the swap device round robin if default priority. In code, the plist array swap_avail_heads[nid] is replaced with a plist swap_avail_head which reverts commit a2468cc9bfdf. Meanwhile, on top of the revert, further change is taken to make any device w/o specified priority get the same default priority '-1'. Surely, swap device with specified priority are always put foremost, this is not impacted. If you care about their different accessing speed, then use 'swapon -p xx' to deploy priority for your swap devices. New behaviour: swap_avail_list: /* one global available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:1 prio:1 prio:1 This is the swapon output during the process high pressure vm-scability being taken, all is selected round robin: ======================================= [root@hp-dl385g10-03 linux]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 12.6G -1 /dev/zram1 partition 16G 12.6G -1 /dev/zram2 partition 16G 12.6G -1 /dev/zram3 partition 16G 12.6G -1 With the change, we can see about 18% efficiency promotion as below: vm-scability test: ================== Test with: usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap) Before: After: System time: 637.92 s 526.74 s (lower is better) Sum Throughput: 3546.56 MB/s 4207.56 MB/s (higher is better) Single process Throughput: 114.40 MB/s 135.72 MB/s (higher is better) free latency: 10138455.99 us 6810119.01 us (low is better) This patch (of 2): This reverts commit a2468cc9bfdf ("swap: choose swap device according to numa node"). After this patch, the behaviour will change back to pre-commit a2468cc9bfdf. Means the priority will be set from -1 then downwards by default, and when swapping, it will exhault swap device one by one according to priority from high to low. This is preparation work for later change. [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 16G -1 /dev/zram1 partition 16G 966.2M -2 /dev/zram2 partition 16G 0B -3 /dev/zram3 partition 16G 0B -4 Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Chris Li Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Kairui Song Cc: Barry Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/index.rst | 1 - Documentation/admin-guide/mm/swap_numa.rst | 78 ----------------------------- include/linux/swap.h | 11 +--- mm/swapfile.c | 80 ++++++------------------------ 4 files changed, 15 insertions(+), 155 deletions(-) delete mode 100644 Documentation/admin-guide/mm/swap_numa.rst (limited to 'include/linux') diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index ebc83ca20fdc..bbb563cba5d2 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -39,7 +39,6 @@ the Linux memory management. shrinker_debugfs slab soft-dirty - swap_numa transhuge userfaultfd zswap diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst deleted file mode 100644 index 2e630627bcee..000000000000 --- a/Documentation/admin-guide/mm/swap_numa.rst +++ /dev/null @@ -1,78 +0,0 @@ -=========================================== -Automatically bind swap device to numa node -=========================================== - -If the system has more than one swap device and swap device has the node -information, we can make use of this information to decide which swap -device to use in get_swap_pages() to get better performance. - - -How to use this feature -======================= - -Swap device has priority and that decides the order of it to be used. To make -use of automatically binding, there is no need to manipulate priority settings -for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and -swapB, with swapA attached to node 0 and swapB attached to node 1, are going -to be swapped on. Simply swapping them on by doing:: - - # swapon /dev/swapA - # swapon /dev/swapB - -Then node 0 will use the two swap devices in the order of swapA then swapB and -node 1 will use the two swap devices in the order of swapB then swapA. Note -that the order of them being swapped on doesn't matter. - -A more complex example on a 4 node machine. Assume 6 swap devices are going to -be swapped on: swapA and swapB are attached to node 0, swapC is attached to -node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. -The way to swap them on is the same as above:: - - # swapon /dev/swapA - # swapon /dev/swapB - # swapon /dev/swapC - # swapon /dev/swapD - # swapon /dev/swapE - # swapon /dev/swapF - -Then node 0 will use them in the order of:: - - swapA/swapB -> swapC -> swapD -> swapE -> swapF - -swapA and swapB will be used in a round robin mode before any other swap device. - -node 1 will use them in the order of:: - - swapC -> swapA -> swapB -> swapD -> swapE -> swapF - -node 2 will use them in the order of:: - - swapD/swapE -> swapA -> swapB -> swapC -> swapF - -Similaly, swapD and swapE will be used in a round robin mode before any -other swap devices. - -node 3 will use them in the order of:: - - swapF -> swapA -> swapB -> swapC -> swapD -> swapE - - -Implementation details -====================== - -The current code uses a priority based list, swap_avail_list, to decide -which swap device to use and if multiple swap devices share the same -priority, they are used round robin. This change here replaces the single -global swap_avail_list with a per-numa-node list, i.e. for each numa node, -it sees its own priority based list of available swap devices. Swap -device's priority can be promoted on its matching node's swap_avail_list. - -The current swap device's priority is set as: user can set a >=0 value, -or the system will pick one starting from -1 then downwards. The priority -value in the swap_avail_list is the negated value of the swap device's -due to plist being sorted from low to high. The new policy doesn't change -the semantics for priority >=0 cases, the previous starting from -1 then -downwards now becomes starting from -2 then downwards and -1 is reserved -as the promoted value. So if multiple swap devices are attached to the same -node, they will all be promoted to priority -1 on that node's plist and will -be used round robin before any other swap devices. diff --git a/include/linux/swap.h b/include/linux/swap.h index a4b264817735..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 125d893bb706..ce3580e2f4f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority = -1; +static int least_priority; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; @@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static struct plist_head *swap_avail_heads; +static PLIST_HEAD(swap_avail_head); static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -1130,7 +1130,6 @@ done: /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { - int nid; unsigned long pages; spin_lock(&swap_avail_lock); @@ -1159,8 +1158,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) goto skip; } - for_each_node(nid) - plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1169,7 +1167,6 @@ skip: /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { - int nid; long val; unsigned long pages; @@ -1202,8 +1199,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) goto skip; } - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_list, &swap_avail_head); skip: spin_unlock(&swap_avail_lock); @@ -1346,16 +1342,14 @@ static bool swap_alloc_fast(swp_entry_t *entry, static bool swap_alloc_slow(swp_entry_t *entry, int order) { - int node; unsigned long offset; struct swap_info_struct *si, *next; - node = numa_node_id(); spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { /* Rotate the device and switch to a new cluster */ - plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); + plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); @@ -1380,7 +1374,7 @@ start_over: * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_lists[node])) + if (plist_node_empty(&si->avail_list)) goto start_over; } spin_unlock(&swap_avail_lock); @@ -1394,11 +1388,10 @@ start_over: static bool swap_sync_discard(void) { bool ret = false; - int nid = numa_node_id(); struct swap_info_struct *si, *next; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { if (si->flags & SWP_PAGE_DISCARD) @@ -2709,25 +2702,11 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *si) -{ - struct block_device *bdev; - - if (si->bdev) - bdev = si->bdev; - else - bdev = si->swap_file->f_inode->i_sb->s_bdev; - - return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; -} - static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { - int i; - if (prio >= 0) si->prio = prio; else @@ -2737,16 +2716,7 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; - for_each_node(i) { - if (si->prio >= 0) - si->avail_lists[i].prio = -si->prio; - else { - if (swap_node(si) == i) - si->avail_lists[i].prio = 1; - else - si->avail_lists[i].prio = -si->prio; - } - } + si->avail_list.prio = -si->prio; si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; @@ -2919,15 +2889,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; - int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - for_each_node(nid) { - if (si->avail_lists[nid].prio != 1) - si->avail_lists[nid].prio--; - } + si->avail_list.prio--; } least_priority++; } @@ -3168,9 +3134,8 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; - int i; - p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); + p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -3209,8 +3174,7 @@ static struct swap_info_struct *alloc_swap_info(void) } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); - for_each_node(i) - plist_node_init(&p->avail_lists[i], 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { @@ -3467,9 +3431,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!swap_avail_heads) - return -ENOMEM; - si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); @@ -4079,7 +4040,6 @@ static bool __has_usable_swap(void) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; - int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; @@ -4098,8 +4058,8 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], - avail_lists[nid]) { + plist_for_each_entry_safe(si, next, &swap_avail_head, + avail_list) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; @@ -4111,18 +4071,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) static int __init swapfile_init(void) { - int nid; - - swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), - GFP_KERNEL); - if (!swap_avail_heads) { - pr_emerg("Not enough memory for swap heads, swap is disabled\n"); - return -ENOMEM; - } - - for_each_node(nid) - plist_head_init(&swap_avail_heads[nid]); - swapfile_maximum_size = arch_max_swapfile_size(); /* -- cgit v1.2.3 From 1a4f70f6851a1916c4f0e52731c7ecfe99bf36e6 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:28 +0000 Subject: mm: convert memory block states (MEM_*) macros to enum Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2. The MEM_* constants indicating the state of a memory block are currently defined as macros, meaning their definitions will be omitted from the debuginfo on most kernel builds. This makes it harder for debuggers to correctly map the block state at runtime, which can be quite useful when analysing errors related to memory hot plugging and unplugging with tools such as drgn. Converting the constants to an enum ensures the correct information is emitted by the compiler and available for the debugger, without needing to hard-code them into the debugger and track their changes. This patch series aims to replace the current macros with a newly created enum named memory_block_state, while also taking advantage of the compile time guarantees that we get when using enums. The first patch does the conversion of the macros to an enum, while the 2nd and 3rd patches use this enum to clean up some type declarations and make sure that only valid values are used. This patch (of 3): Converting the MEM_* constants from macros to an enum ensures that their values will be correctly emitted in the debug symbols, making it easier to trace the meaning of each value when debugging with tools such as drgn, without the need to hard-code the values. Since the values are mutually exclusive and they are not exposed directly to userspace, I also dropped the misleading pattern (1< Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..f4e358477c6a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,6 +64,18 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, + MEM_PREPARE_ONLINE, + MEM_FINISH_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -89,16 +101,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) - struct memory_notify { /* * The altmap_start_pfn and altmap_nr_pages fields are designated for -- cgit v1.2.3 From 8bc7ba3d265d6ee698de4b1941b7e8f7d91a0562 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:30 +0000 Subject: mm: change type of state in struct memory_block The state of a memory block should be restricted to values specified in the documentation of the memory hotplug API. However, since the state field in the memory_block struct was defined as an unsigned long, this restriction was not enforced at compile time. With the introduction of the enum memory_block_state, it is now possible to incorporate the desired semantics in the field declaration and enforce these restrictions at compile time. [akpm@linux-foundation.org: fix whitespace, per Randy] Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 2 +- include/linux/memory.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 6d84a02cfa5d..3d17dd774947 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -198,7 +198,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, break; default: WARN_ON(1); - return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state); + return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state); } return sysfs_emit(buf, "%s\n", output); diff --git a/include/linux/memory.h b/include/linux/memory.h index f4e358477c6a..ca20cbdd71f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -78,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* -- cgit v1.2.3 From ed1f8855dd7b82a0ad87960b1729a3e848dc5589 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:32 +0000 Subject: mm: change type of parameter for memory_notify memory_notify() is responsible for sending events related to memory hotplugging to a notification queue. Since all the events must match one of the values from the enum memory_block_state, it is appropriate to change the function parameter type to make this condition explicit at compile time. Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 3d17dd774947..c03f3b5e5e6f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -204,9 +204,9 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%s\n", output); } -int memory_notify(unsigned long val, void *v) +int memory_notify(enum memory_block_state state, void *v) { - return blocking_notifier_call_chain(&memory_chain, val, v); + return blocking_notifier_call_chain(&memory_chain, state, v); } #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) diff --git a/include/linux/memory.h b/include/linux/memory.h index ca20cbdd71f2..ca3eb1db6cc8 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -141,7 +141,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -165,7 +165,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, -- cgit v1.2.3 From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:33 +0000 Subject: mm: handle poisoning of pfn without struct pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large size cluster. The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. If a memory region mapped using remap_pfn_range() for example, but not added to the kernel, MM will not have associated struct pages. Add a new mechanism to handle memory failure on such memory. Make kernel MM expose a function to allow modules managing the device memory to register the device memory SPA and the address space associated it. MM maintains this information as an interval tree. On poison, MM can search for the range that the poisoned PFN belong and use the address_space to determine the mapping VMA. In this implementation, kernel MM follows the following sequence that is largely similar to the memory_failure() handler for struct page backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() collects the processes mapped to the PFN. 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the faulty PFN using kill_procs(). Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such as way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com Signed-off-by: Ankit Agrawal Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R. Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/memory-failure.h | 17 +++++ include/linux/mm.h | 1 + include/ras/ras_event.h | 1 + mm/Kconfig | 1 + mm/memory-failure.c | 145 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/memory-failure.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 2625bc3d53d8..5cf6873569d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11557,6 +11557,7 @@ M: Miaohe Lin R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained +F: include/linux/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7bcd9e6fbc3c..b636d12bb651 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4285,6 +4285,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..fecfeb7c8be7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/Kconfig b/mm/Kconfig index eae03b14f7de..d548976d0e0a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -741,6 +741,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 560884dd6250..77391b6f9f76 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -885,6 +890,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1277,7 +1283,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); update_per_node_mf_stats(pfn, result); } @@ -2147,6 +2153,135 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + return -EBUSY; + + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static void add_to_kill_pfn(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + unsigned long pfn) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pfn, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pfn, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct address_space *mapping, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { + if (vma->vm_mm == t->mm) + add_to_kill_pfn(t, vma, to_kill, pfn); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. + */ +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + LIST_HEAD(tokill); + + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + + /* + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + + collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + + mf_handled = true; + } + + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); + } + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2196,6 +2331,14 @@ int memory_failure(unsigned long pfn, int flags) if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. + */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); -- cgit v1.2.3 From f49ae86483c494ddc793d889f6df5ea68d138569 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:54 +0000 Subject: memregion: Drop unused IORES_DESC_* parameter from cpu_cache_invalidate_memregion() The res_desc parameter was originally introduced for documentation purposes and with the idea that with HDM-DB CXL invalidation could be triggered from the device. That has not come to pass and the continued existence of the option is confusing when we add a range in the following patch which might not be a strict subset of the res_desc. So avoid that confusion by dropping the parameter. Link: https://lore.kernel.org/linux-mm/686eedb25ed02_24471002e@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Dan Williams Suggested-by: Dan Williams Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley --- arch/x86/mm/pat/set_memory.c | 2 +- drivers/cxl/core/region.c | 2 +- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 2 +- include/linux/memregion.h | 7 +++---- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 8834c76f91c9..4019b17fb65e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void) } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); -int cpu_cache_invalidate_memregion(int res_desc) +int cpu_cache_invalidate_memregion(void) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 71cc42d05248..d7fa76810f82 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -228,7 +228,7 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) return -ENXIO; } - cpu_cache_invalidate_memregion(IORES_DESC_CXL); + cpu_cache_invalidate_memregion(); return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 88dc062af5f8..c43506448edf 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev) * here is ok. */ if (cpu_cache_has_invalidate_memregion()) - cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); + cpu_cache_invalidate_memregion(); } static int child_notify(struct device *dev, void *data) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index de1ee5ebc851..3cdd93d40997 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region) } } - cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY); + cpu_cache_invalidate_memregion(); out: for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; diff --git a/include/linux/memregion.h b/include/linux/memregion.h index c01321467789..945646bde825 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -26,8 +26,7 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for - * memregions described by @res_desc - * @res_desc: one of the IORES_DESC_* types + * memregion * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -46,7 +45,7 @@ static inline void memregion_free(int id) * the cache maintenance. */ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(int res_desc); +int cpu_cache_invalidate_memregion(void); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -54,7 +53,7 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(int res_desc) +static inline int cpu_cache_invalidate_memregion(void) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; -- cgit v1.2.3 From b43652d867cf2a5f31b14e3d9a320ad01fca0992 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Nov 2025 10:47:55 +0000 Subject: memregion: Support fine grained invalidate by cpu_cache_invalidate_memregion() Extend cpu_cache_invalidate_memregion() to support invalidating a particular range of memory by introducing start and length parameters. Control of types of invalidation is left for when use cases turn up. For now everything is Clean and Invalidate. Where the range is unknown, use the provided cpu_cache_invalidate_all() helper to act as documentation of intent in a fashion that is clearer than passing (0, -1) to cpu_cache_invalidate_memregion(). Signed-off-by: Yicong Yang Reviewed-by: Dan Williams Acked-by: Davidlohr Bueso Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley --- arch/x86/mm/pat/set_memory.c | 2 +- drivers/cxl/core/region.c | 5 ++++- drivers/nvdimm/region.c | 2 +- drivers/nvdimm/region_devs.c | 2 +- include/linux/memregion.h | 13 +++++++++++-- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 4019b17fb65e..292c7202faed 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -368,7 +368,7 @@ bool cpu_cache_has_invalidate_memregion(void) } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); -int cpu_cache_invalidate_memregion(void) +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d7fa76810f82..410e41cef5d3 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -228,7 +228,10 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) return -ENXIO; } - cpu_cache_invalidate_memregion(); + if (!cxlr->params.res) + return -ENXIO; + cpu_cache_invalidate_memregion(cxlr->params.res->start, + resource_size(cxlr->params.res)); return 0; } diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index c43506448edf..42e982db5b04 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -110,7 +110,7 @@ static void nd_region_remove(struct device *dev) * here is ok. */ if (cpu_cache_has_invalidate_memregion()) - cpu_cache_invalidate_memregion(); + cpu_cache_invalidate_all(); } static int child_notify(struct device *dev, void *data) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3cdd93d40997..e27fc380f6c0 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -90,7 +90,7 @@ static int nd_region_invalidate_memregion(struct nd_region *nd_region) } } - cpu_cache_invalidate_memregion(); + cpu_cache_invalidate_all(); out: for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; diff --git a/include/linux/memregion.h b/include/linux/memregion.h index 945646bde825..a55f62cc5266 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -27,6 +27,9 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for * memregion + * @start: start physical address of the target memory region. + * @len: length of the target memory region. -1 for all the regions of + * the target type. * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -45,7 +48,7 @@ static inline void memregion_free(int id) * the cache maintenance. */ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(void); +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -53,10 +56,16 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(void) +static inline int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; } #endif + +static inline int cpu_cache_invalidate_all(void) +{ + return cpu_cache_invalidate_memregion(0, -1); +} + #endif /* _MEMREGION_H_ */ -- cgit v1.2.3 From fc45aee66223253ec5547094d7552819914abdfb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Mar 2025 00:06:29 -0400 Subject: get rid of kill_litter_super() Not used anymore. Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 7 +++++++ fs/dcache.c | 21 --------------------- fs/internal.h | 1 - fs/super.c | 8 -------- include/linux/dcache.h | 1 - include/linux/fs.h | 1 - 6 files changed, 7 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..4921b3b0662a 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1309,3 +1309,10 @@ a different length, use vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len)) instead. + +--- + +**mandatory** + +kill_litter_super() is gone; convert to DCACHE_PERSISTENT use (as all +in-tree filesystems have done). diff --git a/fs/dcache.c b/fs/dcache.c index 3cc6c3876177..5ee2e78a91b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3167,27 +3167,6 @@ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) } EXPORT_SYMBOL(is_subdir); -static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) -{ - struct dentry *root = data; - if (dentry != root) { - if (d_unhashed(dentry) || !dentry->d_inode || - dentry->d_flags & DCACHE_PERSISTENT) - return D_WALK_SKIP; - - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } - } - return D_WALK_CONTINUE; -} - -void d_genocide(struct dentry *parent) -{ - d_walk(parent, parent, d_genocide_kill); -} - void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; diff --git a/fs/internal.h b/fs/internal.h index 9b2b4d116880..144686af6c36 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -227,7 +227,6 @@ extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); -extern void d_genocide(struct dentry *); /* * pipe.c diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..ee001f684d2a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1284,14 +1284,6 @@ void kill_anon_super(struct super_block *sb) } EXPORT_SYMBOL(kill_anon_super); -void kill_litter_super(struct super_block *sb) -{ - if (sb->s_root) - d_genocide(sb->s_root); - kill_anon_super(sb); -} -EXPORT_SYMBOL(kill_litter_super); - int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6ec4066825e3..20a85144a00e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -198,7 +198,6 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), - DCACHE_GENOCIDE = BIT(9), DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* diff --git a/include/linux/fs.h b/include/linux/fs.h index f5037c556f61..95933ceaae51 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2728,7 +2728,6 @@ void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); -void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); -- cgit v1.2.3 From ca459ca70f60ce05445845eca74c788b0d5ddb1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 Oct 2025 18:34:49 -0400 Subject: kill securityfs_recursive_remove() it's an unused alias for securityfs_remove() Acked-by: Paul Moore Signed-off-by: Al Viro --- include/linux/security.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 92ac3f27b973..9e710cfee744 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2258,8 +2258,6 @@ static inline void securityfs_remove(struct dentry *dentry) #endif -#define securityfs_recursive_remove securityfs_remove - #ifdef CONFIG_BPF_SYSCALL union bpf_attr; struct bpf_map; -- cgit v1.2.3 From eb028c33451af08bb34f45c6be6967ef1c98cbd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Oct 2025 18:32:21 -0400 Subject: d_make_discardable(): warn if given a non-persistent dentry At this point there are very few call chains that might lead to d_make_discardable() on a dentry that hadn't been made persistent: calls of simple_unlink() and simple_rmdir() in configfs and apparmorfs. Both filesystems do pin (part of) their contents in dcache, but they are currently playing very unusual games with that. Converting them to more usual patterns might be possible, but it's definitely going to be a long series of changes in both cases. For now the easiest solution is to have both stop using simple_unlink() and simple_rmdir() - that allows to make d_make_discardable() warn when given a non-persistent dentry. Rather than giving them full-blown private copies (with calls of d_make_discardable() replaced with dput()), let's pull the parts of simple_unlink() and simple_rmdir() that deal with timestamps and link counts into separate helpers (__simple_unlink() and __simple_rmdir() resp.) and have those used by configfs and apparmorfs. Signed-off-by: Al Viro --- fs/configfs/dir.c | 10 ++++++++-- fs/configfs/inode.c | 3 ++- fs/dcache.c | 9 +-------- fs/libfs.c | 21 +++++++++++++++++---- include/linux/fs.h | 2 ++ security/apparmor/apparmorfs.c | 13 +++++++++---- 6 files changed, 39 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 81f4f06bc87e..e8f2f44012e9 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -400,8 +400,14 @@ static void remove_dir(struct dentry * d) configfs_remove_dirent(d); - if (d_really_is_positive(d)) - simple_rmdir(d_inode(parent),d); + if (d_really_is_positive(d)) { + if (likely(simple_empty(d))) { + __simple_rmdir(d_inode(parent),d); + dput(d); + } else { + pr_warn("remove_dir (%pd): attributes remain", d); + } + } pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 1d2e3a5738d1..bcda3372e141 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -211,7 +211,8 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(d_inode(parent), dentry); + __simple_unlink(d_inode(parent), dentry); + dput(dentry); } else spin_unlock(&dentry->d_lock); } diff --git a/fs/dcache.c b/fs/dcache.c index 5ee2e78a91b3..824d620bb563 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -931,14 +931,7 @@ EXPORT_SYMBOL(dput); void d_make_discardable(struct dentry *dentry) { spin_lock(&dentry->d_lock); - /* - * By the end of the series we'll add - * WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT); - * here, but while object removal is done by a few common helpers, - * object creation tends to be open-coded (if nothing else, new inode - * needs to be set up), so adding a warning from the very beginning - * would make for much messier patch series. - */ + WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; rcu_read_lock(); diff --git a/fs/libfs.c b/fs/libfs.c index 80f288a771e3..0aa630e7eb00 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -790,13 +790,27 @@ out: } EXPORT_SYMBOL(simple_empty); -int simple_unlink(struct inode *dir, struct dentry *dentry) +void __simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); +} +EXPORT_SYMBOL(__simple_unlink); + +void __simple_rmdir(struct inode *dir, struct dentry *dentry) +{ + drop_nlink(d_inode(dentry)); + __simple_unlink(dir, dentry); + drop_nlink(dir); +} +EXPORT_SYMBOL(__simple_rmdir); + +int simple_unlink(struct inode *dir, struct dentry *dentry) +{ + __simple_unlink(dir, dentry); d_make_discardable(dentry); return 0; } @@ -807,9 +821,8 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(d_inode(dentry)); - simple_unlink(dir, dentry); - drop_nlink(dir); + __simple_rmdir(dir, dentry); + d_make_discardable(dentry); return 0; } EXPORT_SYMBOL(simple_rmdir); diff --git a/include/linux/fs.h b/include/linux/fs.h index 95933ceaae51..ef842adbd418 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3621,6 +3621,8 @@ extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); +extern void __simple_unlink(struct inode *, struct dentry *); +extern void __simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 391a586d0557..9b9090d38ea2 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -358,10 +358,15 @@ static void aafs_remove(struct dentry *dentry) dir = d_inode(dentry->d_parent); inode_lock(dir); if (simple_positive(dentry)) { - if (d_is_dir(dentry)) - simple_rmdir(dir, dentry); - else - simple_unlink(dir, dentry); + if (d_is_dir(dentry)) { + if (!WARN_ON(!simple_empty(dentry))) { + __simple_rmdir(dir, dentry); + dput(dentry); + } + } else { + __simple_unlink(dir, dentry); + dput(dentry); + } d_delete(dentry); dput(dentry); } -- cgit v1.2.3 From 9c7dacf5d51910f34a3bd709403f6a82ffc8c960 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:14 +0100 Subject: platform/x86: asus-armoury: add apu-mem control support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the APU memory size control under the asus-armoury module using the fw_attributes class. This allows the APU allocated memory size to be adjusted depending on the users priority. A reboot is required after change. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Link: https://patch.msgid.link/20251102215319.3126879-5-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 98 ++++++++++++++++++++++++++++++ include/linux/platform_data/x86/asus-wmi.h | 2 + 2 files changed, 100 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index f0cb973a487e..1b972260c5dd 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -174,6 +174,7 @@ static int armoury_get_devstate(struct kobj_attribute *attr, u32 *retval, u32 de * and should perform relevant checks. * * Returns: + * * %-EINVAL - attempt to set a dangerous or unsupported value. * * %-EIO - WMI function returned an error. * * %0 - successful and retval is filled. * * %other - error from WMI call. @@ -184,6 +185,26 @@ static int armoury_set_devstate(struct kobj_attribute *attr, u32 result; int err; + /* + * Prevent developers from bricking devices or issuing dangerous + * commands that can be difficult or impossible to recover from. + */ + switch (dev_id) { + case ASUS_WMI_DEVID_APU_MEM: + /* + * A hard reset might suffice to save the device, + * but there is no value in sending these commands. + */ + if (value == 0x100 || value == 0x101) { + pr_err("Refusing to set APU memory to unsafe value: 0x%x\n", value); + return -EINVAL; + } + break; + default: + /* No problems are known for this dev_id */ + break; + } + err = asus_wmi_set_devstate(dev_id, value, retval ? retval : &result); if (err) { if (attr) @@ -599,6 +620,82 @@ static ssize_t egpu_enable_possible_values_show(struct kobject *kobj, struct kob } ASUS_ATTR_GROUP_ENUM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)"); +/* Device memory available to APU */ + +/* + * Values map for APU reserved memory (index + 1 number of GB). + * Some looks out of order, but are actually correct. + */ +static u32 apu_mem_map[] = { + [0] = 0x000, /* called "AUTO" on the BIOS, is the minimum available */ + [1] = 0x102, + [2] = 0x103, + [3] = 0x104, + [4] = 0x105, + [5] = 0x107, + [6] = 0x108, + [7] = 0x109, + [8] = 0x106, +}; + +static ssize_t apu_mem_current_value_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int err; + u32 mem; + + err = armoury_get_devstate(attr, &mem, ASUS_WMI_DEVID_APU_MEM); + if (err) + return err; + + /* After 0x000 is set, a read will return 0x100 */ + if (mem == 0x100) + return sysfs_emit(buf, "0\n"); + + for (unsigned int i = 0; i < ARRAY_SIZE(apu_mem_map); i++) { + if (apu_mem_map[i] == mem) + return sysfs_emit(buf, "%u\n", i); + } + + pr_warn("Unrecognised value for APU mem 0x%08x\n", mem); + return -EIO; +} + +static ssize_t apu_mem_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int result, err; + u32 requested, mem; + + result = kstrtou32(buf, 10, &requested); + if (result) + return result; + + if (requested >= ARRAY_SIZE(apu_mem_map)) + return -EINVAL; + mem = apu_mem_map[requested]; + + err = armoury_set_devstate(attr, mem, NULL, ASUS_WMI_DEVID_APU_MEM); + if (err) { + pr_warn("Failed to set apu_mem 0x%x: %d\n", mem, err); + return err; + } + + pr_info("APU memory changed to %uGB, reboot required\n", requested + 1); + sysfs_notify(kobj, NULL, attr->attr.name); + + asus_set_reboot_and_signal_event(); + + return count; +} + +static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return armoury_attr_enum_list(buf, ARRAY_SIZE(apu_mem_map)); +} +ASUS_ATTR_GROUP_ENUM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use"); + /* Simple attribute creation */ ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n", "Show the current mode of charging"); @@ -618,6 +715,7 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED }, { &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU }, { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, + { &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM }, { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3cc235b20be4..9a6433d08973 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -136,6 +136,8 @@ /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 +#define ASUS_WMI_DEVID_APU_MEM 0x000600C1 + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 -- cgit v1.2.3 From 7725a2dc58632cb44eeef2e5b959ab7b7931298d Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:16 +0100 Subject: platform/x86: asus-armoury: add screen auto-brightness toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add screen_auto_brightness toggle supported on some laptops. Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251102215319.3126879-7-denis.benato@linux.dev Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 4 ++++ include/linux/platform_data/x86/asus-wmi.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index 1b972260c5dd..c1dbaed409d2 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -707,6 +707,9 @@ ASUS_ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, "Set the panel refresh overdrive"); ASUS_ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD, "Set the panel HD mode to UHD<0> or FHD<1>"); +ASUS_ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness", + ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS, + "Set the panel brightness to Off<0> or On<1>"); ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, "Show the eGPU connection status"); @@ -722,6 +725,7 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, { &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD }, + { &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS }, }; static int asus_fw_attr_add(void) diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 9a6433d08973..3af075baf9f7 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -82,6 +82,7 @@ #define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 #define ASUS_WMI_DEVID_MINI_LED_MODE 0x0005001E #define ASUS_WMI_DEVID_MINI_LED_MODE2 0x0005002E +#define ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS 0x0005002A /* Storage */ #define ASUS_WMI_DEVID_CARDREADER 0x00080013 -- cgit v1.2.3 From d849a9f2380d5287d5133eac5bae602a147b86c2 Mon Sep 17 00:00:00 2001 From: Denis Benato Date: Sun, 2 Nov 2025 22:53:18 +0100 Subject: platform/x86: asus-wmi: rename ASUS_WMI_DEVID_PPT_FPPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain power-related WMI macros naming consistency: rename ASUS_WMI_DEVID_PPT_FPPT to ASUS_WMI_DEVID_PPT_PL3_FPPT. Link: https://lore.kernel.org/all/cad7b458-5a7a-4975-94a1-d0c74f6f3de5@oracle.com/ Suggested-by: ALOK TIWARI Signed-off-by: Denis Benato Link: https://.../ Link: https://patch.msgid.link/20251102215319.3126879-9-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 4 ++-- include/linux/platform_data/x86/asus-wmi.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 6de633d4a748..64cfc0bf98dd 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -1218,7 +1218,7 @@ static ssize_t ppt_fppt_store(struct device *dev, if (value < PPT_TOTAL_MIN || value > PPT_TOTAL_MAX) return -EINVAL; - err = asus_wmi_set_devstate(ASUS_WMI_DEVID_PPT_FPPT, value, &result); + err = asus_wmi_set_devstate(ASUS_WMI_DEVID_PPT_PL3_FPPT, value, &result); if (err) { pr_warn("Failed to set ppt_fppt: %d\n", err); return err; @@ -4602,7 +4602,7 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, else if (attr == &dev_attr_ppt_pl1_spl.attr) devid = ASUS_WMI_DEVID_PPT_PL1_SPL; else if (attr == &dev_attr_ppt_fppt.attr) - devid = ASUS_WMI_DEVID_PPT_FPPT; + devid = ASUS_WMI_DEVID_PPT_PL3_FPPT; else if (attr == &dev_attr_ppt_apu_sppt.attr) devid = ASUS_WMI_DEVID_PPT_APU_SPPT; else if (attr == &dev_attr_ppt_platform_sppt.attr) diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3af075baf9f7..e7c95e9d29db 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -107,7 +107,7 @@ #define ASUS_WMI_DEVID_PPT_PL1_SPL 0x001200A3 #define ASUS_WMI_DEVID_PPT_APU_SPPT 0x001200B0 #define ASUS_WMI_DEVID_PPT_PLAT_SPPT 0x001200B1 -#define ASUS_WMI_DEVID_PPT_FPPT 0x001200C1 +#define ASUS_WMI_DEVID_PPT_PL3_FPPT 0x001200C1 #define ASUS_WMI_DEVID_NV_DYN_BOOST 0x001200C0 #define ASUS_WMI_DEVID_NV_THERM_TARGET 0x001200C2 -- cgit v1.2.3 From 39ae6c50e599aa0cf62ea3d0dcf06492f7690ed7 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:19 +0100 Subject: platform/x86: asus-armoury: add ppt_* and nv_* tuning knobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the ppt_* and nv_* tuning knobs that are available via WMI methods and adds proper min/max levels plus defaults. The min/max are defined by ASUS and typically gained by looking at what they allow in the ASUS Armoury Crate application - ASUS does not share the values outside of this. It could also be possible to gain the AMD values by use of ryzenadj and testing for the minimum stable value. The general rule of thumb for adding to the match table is that if the model range has a single CPU used throughout, then the DMI match can omit the last letter of the model number as this is the GPU model. If a min or max value is not provided it is assumed that the particular setting is not supported. for example ppt_pl2_sppt_min/max is not set. If a _def is not set then the default is assumed to be _max It is assumed that at least AC settings are available so that the firmware attributes will be created - if no DC table is available and power is on DC, then reading the attributes is -ENODEV. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Tested-by: Mateusz Schyboll Tested-by: Porfet Lillian Link: https://patch.msgid.link/20251102215319.3126879-10-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-armoury.c | 302 ++++++- drivers/platform/x86/asus-armoury.h | 1294 ++++++++++++++++++++++++++++ include/linux/platform_data/x86/asus-wmi.h | 3 + 3 files changed, 1593 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c index c1dbaed409d2..d6aba68515e2 100644 --- a/drivers/platform/x86/asus-armoury.c +++ b/drivers/platform/x86/asus-armoury.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "asus-armoury.h" @@ -48,6 +49,33 @@ #define ASUS_MINI_LED_2024_STRONG 0x01 #define ASUS_MINI_LED_2024_OFF 0x02 +/* Power tunable attribute name defines */ +#define ATTR_PPT_PL1_SPL "ppt_pl1_spl" +#define ATTR_PPT_PL2_SPPT "ppt_pl2_sppt" +#define ATTR_PPT_PL3_FPPT "ppt_pl3_fppt" +#define ATTR_PPT_APU_SPPT "ppt_apu_sppt" +#define ATTR_PPT_PLATFORM_SPPT "ppt_platform_sppt" +#define ATTR_NV_DYNAMIC_BOOST "nv_dynamic_boost" +#define ATTR_NV_TEMP_TARGET "nv_temp_target" +#define ATTR_NV_BASE_TGP "nv_base_tgp" +#define ATTR_NV_TGP "nv_tgp" + +#define ASUS_ROG_TUNABLE_DC 0 +#define ASUS_ROG_TUNABLE_AC 1 + +struct rog_tunables { + const struct power_limits *power_limits; + u32 ppt_pl1_spl; // cpu + u32 ppt_pl2_sppt; // cpu + u32 ppt_pl3_fppt; // cpu + u32 ppt_apu_sppt; // plat + u32 ppt_platform_sppt; // plat + + u32 nv_dynamic_boost; + u32 nv_temp_target; + u32 nv_tgp; +}; + struct asus_armoury_priv { struct device *fw_attr_dev; struct kset *fw_attr_kset; @@ -60,6 +88,9 @@ struct asus_armoury_priv { */ struct mutex egpu_mutex; + /* Index 0 for DC, 1 for AC */ + struct rog_tunables *rog_tunables[2]; + u32 mini_led_dev_id; u32 gpu_mux_dev_id; }; @@ -290,6 +321,12 @@ static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr, return sysfs_emit(buf, "enumeration\n"); } +static ssize_t int_type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "integer\n"); +} + /* Mini-LED mode **************************************************************/ /* Values map for mini-led modes on 2023 and earlier models. */ @@ -696,6 +733,15 @@ static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_at } ASUS_ATTR_GROUP_ENUM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use"); +/* Define helper to access the current power mode tunable values */ +static inline struct rog_tunables *get_current_tunables(void) +{ + if (power_supply_is_system_supplied()) + return asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]; + + return asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]; +} + /* Simple attribute creation */ ASUS_ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2\n", "Show the current mode of charging"); @@ -712,6 +758,24 @@ ASUS_ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness", "Set the panel brightness to Off<0> or On<1>"); ASUS_ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, "Show the eGPU connection status"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl1_spl, ATTR_PPT_PL1_SPL, ASUS_WMI_DEVID_PPT_PL1_SPL, + "Set the CPU slow package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl2_sppt, ATTR_PPT_PL2_SPPT, ASUS_WMI_DEVID_PPT_PL2_SPPT, + "Set the CPU fast package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_pl3_fppt, ATTR_PPT_PL3_FPPT, ASUS_WMI_DEVID_PPT_PL3_FPPT, + "Set the CPU fastest package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_apu_sppt, ATTR_PPT_APU_SPPT, ASUS_WMI_DEVID_PPT_APU_SPPT, + "Set the APU package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(ppt_platform_sppt, ATTR_PPT_PLATFORM_SPPT, ASUS_WMI_DEVID_PPT_PLAT_SPPT, + "Set the platform package limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(nv_dynamic_boost, ATTR_NV_DYNAMIC_BOOST, ASUS_WMI_DEVID_NV_DYN_BOOST, + "Set the Nvidia dynamic boost limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(nv_temp_target, ATTR_NV_TEMP_TARGET, ASUS_WMI_DEVID_NV_THERM_TARGET, + "Set the Nvidia max thermal limit"); +ASUS_ATTR_GROUP_ROG_TUNABLE(nv_tgp, "nv_tgp", ASUS_WMI_DEVID_DGPU_SET_TGP, + "Set the additional TGP on top of the base TGP"); +ASUS_ATTR_GROUP_INT_VALUE_ONLY_RO(nv_base_tgp, ATTR_NV_BASE_TGP, ASUS_WMI_DEVID_DGPU_BASE_TGP, + "Read the base TGP value"); /* If an attribute does not require any special case handling add it here */ static const struct asus_attr_group armoury_attr_groups[] = { @@ -720,6 +784,16 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, { &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM }, + { &ppt_pl1_spl_attr_group, ASUS_WMI_DEVID_PPT_PL1_SPL }, + { &ppt_pl2_sppt_attr_group, ASUS_WMI_DEVID_PPT_PL2_SPPT }, + { &ppt_pl3_fppt_attr_group, ASUS_WMI_DEVID_PPT_PL3_FPPT }, + { &ppt_apu_sppt_attr_group, ASUS_WMI_DEVID_PPT_APU_SPPT }, + { &ppt_platform_sppt_attr_group, ASUS_WMI_DEVID_PPT_PLAT_SPPT }, + { &nv_dynamic_boost_attr_group, ASUS_WMI_DEVID_NV_DYN_BOOST }, + { &nv_temp_target_attr_group, ASUS_WMI_DEVID_NV_THERM_TARGET }, + { &nv_base_tgp_attr_group, ASUS_WMI_DEVID_DGPU_BASE_TGP }, + { &nv_tgp_attr_group, ASUS_WMI_DEVID_DGPU_SET_TGP }, + { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, @@ -728,8 +802,76 @@ static const struct asus_attr_group armoury_attr_groups[] = { { &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS }, }; +/** + * is_power_tunable_attr - Determines if an attribute is a power-related tunable + * @name: The name of the attribute to check + * + * This function checks if the given attribute name is related to power tuning. + * + * Return: true if the attribute is a power-related tunable, false otherwise + */ +static bool is_power_tunable_attr(const char *name) +{ + static const char * const power_tunable_attrs[] = { + ATTR_PPT_PL1_SPL, ATTR_PPT_PL2_SPPT, + ATTR_PPT_PL3_FPPT, ATTR_PPT_APU_SPPT, + ATTR_PPT_PLATFORM_SPPT, ATTR_NV_DYNAMIC_BOOST, + ATTR_NV_TEMP_TARGET, ATTR_NV_BASE_TGP, + ATTR_NV_TGP + }; + + for (unsigned int i = 0; i < ARRAY_SIZE(power_tunable_attrs); i++) { + if (!strcmp(name, power_tunable_attrs[i])) + return true; + } + + return false; +} + +/** + * has_valid_limit - Checks if a power-related attribute has a valid limit value + * @name: The name of the attribute to check + * @limits: Pointer to the power_limits structure containing limit values + * + * This function checks if a power-related attribute has a valid limit value. + * It returns false if limits is NULL or if the corresponding limit value is zero. + * + * Return: true if the attribute has a valid limit value, false otherwise + */ +static bool has_valid_limit(const char *name, const struct power_limits *limits) +{ + u32 limit_value = 0; + + if (!limits) + return false; + + if (!strcmp(name, ATTR_PPT_PL1_SPL)) + limit_value = limits->ppt_pl1_spl_max; + else if (!strcmp(name, ATTR_PPT_PL2_SPPT)) + limit_value = limits->ppt_pl2_sppt_max; + else if (!strcmp(name, ATTR_PPT_PL3_FPPT)) + limit_value = limits->ppt_pl3_fppt_max; + else if (!strcmp(name, ATTR_PPT_APU_SPPT)) + limit_value = limits->ppt_apu_sppt_max; + else if (!strcmp(name, ATTR_PPT_PLATFORM_SPPT)) + limit_value = limits->ppt_platform_sppt_max; + else if (!strcmp(name, ATTR_NV_DYNAMIC_BOOST)) + limit_value = limits->nv_dynamic_boost_max; + else if (!strcmp(name, ATTR_NV_TEMP_TARGET)) + limit_value = limits->nv_temp_target_max; + else if (!strcmp(name, ATTR_NV_BASE_TGP) || + !strcmp(name, ATTR_NV_TGP)) + limit_value = limits->nv_tgp_max; + + return limit_value > 0; +} + static int asus_fw_attr_add(void) { + const struct rog_tunables *const ac_rog_tunables = asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]; + const struct power_limits *limits; + bool should_create; + const char *name; int err, i; asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), @@ -786,12 +928,28 @@ static int asus_fw_attr_add(void) if (!armoury_has_devstate(armoury_attr_groups[i].wmi_devid)) continue; - err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, - armoury_attr_groups[i].attr_group); - if (err) { - pr_err("Failed to create sysfs-group for %s\n", - armoury_attr_groups[i].attr_group->name); - goto err_remove_groups; + /* Always create by default, unless PPT is not present */ + should_create = true; + name = armoury_attr_groups[i].attr_group->name; + + /* Check if this is a power-related tunable requiring limits */ + if (ac_rog_tunables && ac_rog_tunables->power_limits && + is_power_tunable_attr(name)) { + limits = ac_rog_tunables->power_limits; + /* Check only AC: if not present then DC won't be either */ + should_create = has_valid_limit(name, limits); + if (!should_create) + pr_debug("Missing max value for tunable %s\n", name); + } + + if (should_create) { + err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, + armoury_attr_groups[i].attr_group); + if (err) { + pr_err("Failed to create sysfs-group for %s\n", + armoury_attr_groups[i].attr_group->name); + goto err_remove_groups; + } } } @@ -820,6 +978,132 @@ fail_class_get: /* Init / exit ****************************************************************/ +/* Set up the min/max and defaults for ROG tunables */ +static void init_rog_tunables(void) +{ + const struct power_limits *ac_limits, *dc_limits; + struct rog_tunables *ac_rog_tunables = NULL, *dc_rog_tunables = NULL; + const struct power_data *power_data; + const struct dmi_system_id *dmi_id; + + /* Match the system against the power_limits table */ + dmi_id = dmi_first_match(power_limits); + if (!dmi_id) { + pr_warn("No matching power limits found for this system\n"); + return; + } + + /* Get the power data for this system */ + power_data = dmi_id->driver_data; + if (!power_data) { + pr_info("No power data available for this system\n"); + return; + } + + /* Initialize AC power tunables */ + ac_limits = power_data->ac_data; + if (ac_limits) { + ac_rog_tunables = kzalloc(sizeof(*asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]), + GFP_KERNEL); + if (!ac_rog_tunables) + goto err_nomem; + + asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC] = ac_rog_tunables; + ac_rog_tunables->power_limits = ac_limits; + + /* Set initial AC values */ + ac_rog_tunables->ppt_pl1_spl = + ac_limits->ppt_pl1_spl_def ? + ac_limits->ppt_pl1_spl_def : + ac_limits->ppt_pl1_spl_max; + + ac_rog_tunables->ppt_pl2_sppt = + ac_limits->ppt_pl2_sppt_def ? + ac_limits->ppt_pl2_sppt_def : + ac_limits->ppt_pl2_sppt_max; + + ac_rog_tunables->ppt_pl3_fppt = + ac_limits->ppt_pl3_fppt_def ? + ac_limits->ppt_pl3_fppt_def : + ac_limits->ppt_pl3_fppt_max; + + ac_rog_tunables->ppt_apu_sppt = + ac_limits->ppt_apu_sppt_def ? + ac_limits->ppt_apu_sppt_def : + ac_limits->ppt_apu_sppt_max; + + ac_rog_tunables->ppt_platform_sppt = + ac_limits->ppt_platform_sppt_def ? + ac_limits->ppt_platform_sppt_def : + ac_limits->ppt_platform_sppt_max; + + ac_rog_tunables->nv_dynamic_boost = + ac_limits->nv_dynamic_boost_max; + ac_rog_tunables->nv_temp_target = + ac_limits->nv_temp_target_max; + ac_rog_tunables->nv_tgp = ac_limits->nv_tgp_max; + + pr_debug("AC power limits initialized for %s\n", dmi_id->matches[0].substr); + } else { + pr_debug("No AC PPT limits defined\n"); + } + + /* Initialize DC power tunables */ + dc_limits = power_data->dc_data; + if (dc_limits) { + dc_rog_tunables = kzalloc(sizeof(*asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]), + GFP_KERNEL); + if (!dc_rog_tunables) { + kfree(ac_rog_tunables); + goto err_nomem; + } + + asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC] = dc_rog_tunables; + dc_rog_tunables->power_limits = dc_limits; + + /* Set initial DC values */ + dc_rog_tunables->ppt_pl1_spl = + dc_limits->ppt_pl1_spl_def ? + dc_limits->ppt_pl1_spl_def : + dc_limits->ppt_pl1_spl_max; + + dc_rog_tunables->ppt_pl2_sppt = + dc_limits->ppt_pl2_sppt_def ? + dc_limits->ppt_pl2_sppt_def : + dc_limits->ppt_pl2_sppt_max; + + dc_rog_tunables->ppt_pl3_fppt = + dc_limits->ppt_pl3_fppt_def ? + dc_limits->ppt_pl3_fppt_def : + dc_limits->ppt_pl3_fppt_max; + + dc_rog_tunables->ppt_apu_sppt = + dc_limits->ppt_apu_sppt_def ? + dc_limits->ppt_apu_sppt_def : + dc_limits->ppt_apu_sppt_max; + + dc_rog_tunables->ppt_platform_sppt = + dc_limits->ppt_platform_sppt_def ? + dc_limits->ppt_platform_sppt_def : + dc_limits->ppt_platform_sppt_max; + + dc_rog_tunables->nv_dynamic_boost = + dc_limits->nv_dynamic_boost_max; + dc_rog_tunables->nv_temp_target = + dc_limits->nv_temp_target_max; + dc_rog_tunables->nv_tgp = dc_limits->nv_tgp_max; + + pr_debug("DC power limits initialized for %s\n", dmi_id->matches[0].substr); + } else { + pr_debug("No DC PPT limits defined\n"); + } + + return; + +err_nomem: + pr_err("Failed to allocate memory for tunables\n"); +} + static int __init asus_fw_init(void) { char *wmi_uid; @@ -835,6 +1119,9 @@ static int __init asus_fw_init(void) if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) return -ENODEV; + init_rog_tunables(); + + /* Must always be last step to ensure data is available */ return asus_fw_attr_add(); } @@ -857,6 +1144,9 @@ static void __exit asus_fw_exit(void) sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); kset_unregister(asus_armoury.fw_attr_kset); device_destroy(&firmware_attributes_class, MKDEV(0, 0)); + + kfree(asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_AC]); + kfree(asus_armoury.rog_tunables[ASUS_ROG_TUNABLE_DC]); } module_init(asus_fw_init); diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h index 3a2a674a1b55..548c66c590a6 100644 --- a/drivers/platform/x86/asus-armoury.h +++ b/drivers/platform/x86/asus-armoury.h @@ -8,6 +8,7 @@ #ifndef _ASUS_ARMOURY_H_ #define _ASUS_ARMOURY_H_ +#include #include #include #include @@ -197,4 +198,1297 @@ ssize_t armoury_attr_uint_show(struct kobject *kobj, struct kobj_attribute *attr .name = _fsname, .attrs = _attrname##_attrs \ } +#define ASUS_ATTR_GROUP_INT_VALUE_ONLY_RO(_attrname, _fsname, _wmi, _dispname) \ + ASUS_WMI_SHOW_INT(_attrname##_current_value, _wmi); \ + static struct kobj_attribute attr_##_attrname##_current_value = \ + __ASUS_ATTR_RO(_attrname, current_value); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, int_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_type.attr, NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +/* + * ROG PPT attributes need a little different in setup as they + * require rog_tunables members. + */ + +#define __ROG_TUNABLE_SHOW(_prop, _attrname, _val) \ + static ssize_t _attrname##_##_prop##_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables || !tunables->power_limits) \ + return -ENODEV; \ + \ + return sysfs_emit(buf, "%d\n", tunables->power_limits->_val); \ + } \ + static struct kobj_attribute attr_##_attrname##_##_prop = \ + __ASUS_ATTR_RO(_attrname, _prop) + +#define __ROG_TUNABLE_SHOW_DEFAULT(_attrname) \ + static ssize_t _attrname##_default_value_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables || !tunables->power_limits) \ + return -ENODEV; \ + \ + return sysfs_emit( \ + buf, "%d\n", \ + tunables->power_limits->_attrname##_def ? \ + tunables->power_limits->_attrname##_def : \ + tunables->power_limits->_attrname##_max); \ + } \ + static struct kobj_attribute attr_##_attrname##_default_value = \ + __ASUS_ATTR_RO(_attrname, default_value) + +#define __ROG_TUNABLE_RW(_attr, _wmi) \ + static ssize_t _attr##_current_value_store( \ + struct kobject *kobj, struct kobj_attribute *attr, \ + const char *buf, size_t count) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables || !tunables->power_limits) \ + return -ENODEV; \ + \ + if (tunables->power_limits->_attr##_min == \ + tunables->power_limits->_attr##_max) \ + return -EINVAL; \ + \ + return armoury_attr_uint_store(kobj, attr, buf, count, \ + tunables->power_limits->_attr##_min, \ + tunables->power_limits->_attr##_max, \ + &tunables->_attr, _wmi); \ + } \ + static ssize_t _attr##_current_value_show( \ + struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ + { \ + struct rog_tunables *tunables = get_current_tunables(); \ + \ + if (!tunables) \ + return -ENODEV; \ + \ + return sysfs_emit(buf, "%u\n", tunables->_attr); \ + } \ + static struct kobj_attribute attr_##_attr##_current_value = \ + __ASUS_ATTR_RW(_attr, current_value) + +#define ASUS_ATTR_GROUP_ROG_TUNABLE(_attrname, _fsname, _wmi, _dispname) \ + __ROG_TUNABLE_RW(_attrname, _wmi); \ + __ROG_TUNABLE_SHOW_DEFAULT(_attrname); \ + __ROG_TUNABLE_SHOW(min_value, _attrname, _attrname##_min); \ + __ROG_TUNABLE_SHOW(max_value, _attrname, _attrname##_max); \ + __ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1); \ + __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ + static struct kobj_attribute attr_##_attrname##_type = \ + __ASUS_ATTR_RO_AS(type, int_type_show); \ + static struct attribute *_attrname##_attrs[] = { \ + &attr_##_attrname##_current_value.attr, \ + &attr_##_attrname##_default_value.attr, \ + &attr_##_attrname##_min_value.attr, \ + &attr_##_attrname##_max_value.attr, \ + &attr_##_attrname##_scalar_increment.attr, \ + &attr_##_attrname##_display_name.attr, \ + &attr_##_attrname##_type.attr, \ + NULL \ + }; \ + static const struct attribute_group _attrname##_attr_group = { \ + .name = _fsname, .attrs = _attrname##_attrs \ + } + +/* Default is always the maximum value unless *_def is specified */ +struct power_limits { + u8 ppt_pl1_spl_min; + u8 ppt_pl1_spl_def; + u8 ppt_pl1_spl_max; + u8 ppt_pl2_sppt_min; + u8 ppt_pl2_sppt_def; + u8 ppt_pl2_sppt_max; + u8 ppt_pl3_fppt_min; + u8 ppt_pl3_fppt_def; + u8 ppt_pl3_fppt_max; + u8 ppt_apu_sppt_min; + u8 ppt_apu_sppt_def; + u8 ppt_apu_sppt_max; + u8 ppt_platform_sppt_min; + u8 ppt_platform_sppt_def; + u8 ppt_platform_sppt_max; + /* Nvidia GPU specific, default is always max */ + u8 nv_dynamic_boost_def; // unused. exists for macro + u8 nv_dynamic_boost_min; + u8 nv_dynamic_boost_max; + u8 nv_temp_target_def; // unused. exists for macro + u8 nv_temp_target_min; + u8 nv_temp_target_max; + u8 nv_tgp_def; // unused. exists for macro + u8 nv_tgp_min; + u8 nv_tgp_max; +}; + +struct power_data { + const struct power_limits *ac_data; + const struct power_limits *dc_data; + bool requires_fan_curve; +}; + +/* + * For each available attribute there must be a min and a max. + * _def is not required and will be assumed to be default == max if missing. + */ +static const struct dmi_system_id power_limits[] = { + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA401W"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 75, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 30, + .ppt_pl2_sppt_min = 31, + .ppt_pl2_sppt_max = 44, + .ppt_pl3_fppt_min = 45, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507N"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80 + }, + .dc_data = NULL, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507X"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA507Z"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 105, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 15, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA607P"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 30, + .ppt_pl1_spl_def = 100, + .ppt_pl1_spl_max = 135, + .ppt_pl2_sppt_min = 30, + .ppt_pl2_sppt_def = 115, + .ppt_pl2_sppt_max = 135, + .ppt_pl3_fppt_min = 30, + .ppt_pl3_fppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_def = 60, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 25, + .ppt_pl3_fppt_max = 80, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA608WI"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 90, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 90, + .ppt_pl2_sppt_max = 90, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 90, + .ppt_pl3_fppt_max = 90, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 45, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 65, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA617NS"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 120, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 25, + .ppt_apu_sppt_max = 35, + .ppt_platform_sppt_min = 45, + .ppt_platform_sppt_max = 100, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA617NT"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 45, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 50, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FA617XS"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 120, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 25, + .ppt_apu_sppt_max = 35, + .ppt_platform_sppt_min = 45, + .ppt_platform_sppt_max = 100, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FX507VI"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 135, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FX507VV"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_def = 115, + .ppt_pl1_spl_max = 135, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "FX507Z"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 15, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 60, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA401Q"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 80, + }, + .dc_data = NULL, + }, + }, + { + .matches = { + // This model is full AMD. No Nvidia dGPU. + DMI_MATCH(DMI_BOARD_NAME, "GA402R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_apu_sppt_min = 15, + .ppt_apu_sppt_max = 80, + .ppt_platform_sppt_min = 30, + .ppt_platform_sppt_max = 115, + }, + .dc_data = &(struct power_limits) { + .ppt_apu_sppt_min = 25, + .ppt_apu_sppt_def = 30, + .ppt_apu_sppt_max = 45, + .ppt_platform_sppt_min = 40, + .ppt_platform_sppt_max = 60, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA402X"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 35, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_def = 65, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 35, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA403U"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 65, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 35, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA503R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 35, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 65, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 60, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA605W"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 31, + .ppt_pl2_sppt_max = 44, + .ppt_pl3_fppt_min = 45, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU603Z"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 60, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 40, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 40, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + } + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU604V"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 65, + .ppt_pl1_spl_max = 120, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_max = 150, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 40, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 40, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605CW"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 45, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 56, + .ppt_pl2_sppt_max = 110, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 80, + .nv_tgp_def = 90, + .nv_tgp_max = 110, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 32, + .ppt_pl2_sppt_max = 110, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605CX"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 45, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 56, + .ppt_pl2_sppt_max = 110, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 7, + .nv_temp_target_max = 87, + .nv_tgp_min = 95, + .nv_tgp_def = 100, + .nv_tgp_max = 110, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 85, + .ppt_pl2_sppt_min = 32, + .ppt_pl2_sppt_max = 110, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605M"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 38, + .ppt_pl2_sppt_max = 53, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV301Q"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_max = 80, + }, + .dc_data = NULL, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV301R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 45, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 54, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 35, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV601R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 35, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 100, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 80, + .ppt_pl3_fppt_max = 125, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 28, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 54, + .ppt_pl2_sppt_max = 60, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 80, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GV601V"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_def = 100, + .ppt_pl1_spl_max = 110, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 40, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 40, + .ppt_pl2_sppt_max = 60, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GX650P"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 110, + .ppt_pl1_spl_max = 130, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 125, + .ppt_pl2_sppt_max = 130, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 125, + .ppt_pl3_fppt_max = 135, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_def = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_def = 35, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_def = 42, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G513I"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + /* Yes this laptop is very limited */ + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 80, + }, + .dc_data = NULL, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G513QM"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + /* Yes this laptop is very limited */ + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 100, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 190, + }, + .dc_data = NULL, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G513R"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 35, + .ppt_pl1_spl_max = 90, + .ppt_pl2_sppt_min = 54, + .ppt_pl2_sppt_max = 100, + .ppt_pl3_fppt_min = 54, + .ppt_pl3_fppt_max = 125, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 50, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 50, + .ppt_pl3_fppt_min = 28, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G614J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G634J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G713PV"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 30, + .ppt_pl1_spl_def = 120, + .ppt_pl1_spl_max = 130, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_def = 125, + .ppt_pl2_sppt_max = 130, + .ppt_pl3_fppt_min = 65, + .ppt_pl3_fppt_def = 125, + .ppt_pl3_fppt_max = 130, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 75, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G733C"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 170, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 35, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G733P"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 30, + .ppt_pl1_spl_def = 100, + .ppt_pl1_spl_max = 130, + .ppt_pl2_sppt_min = 65, + .ppt_pl2_sppt_def = 125, + .ppt_pl2_sppt_max = 130, + .ppt_pl3_fppt_min = 65, + .ppt_pl3_fppt_def = 125, + .ppt_pl3_fppt_max = 130, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 65, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 65, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 75, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G814J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 140, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "G834J"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 28, + .ppt_pl1_spl_max = 140, + .ppt_pl2_sppt_min = 28, + .ppt_pl2_sppt_max = 175, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 25, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 55, + .ppt_pl2_sppt_min = 25, + .ppt_pl2_sppt_max = 70, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + .requires_fan_curve = true, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "H7606W"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 15, + .ppt_pl1_spl_max = 80, + .ppt_pl2_sppt_min = 35, + .ppt_pl2_sppt_max = 80, + .ppt_pl3_fppt_min = 35, + .ppt_pl3_fppt_max = 80, + .nv_dynamic_boost_min = 5, + .nv_dynamic_boost_max = 20, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + .nv_tgp_min = 55, + .nv_tgp_max = 85, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 25, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 31, + .ppt_pl2_sppt_max = 44, + .ppt_pl3_fppt_min = 45, + .ppt_pl3_fppt_max = 65, + .nv_temp_target_min = 75, + .nv_temp_target_max = 87, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_max = 30, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 43, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_max = 53, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_def = 15, + .ppt_pl1_spl_max = 25, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_def = 20, + .ppt_pl2_sppt_max = 30, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_def = 25, + .ppt_pl3_fppt_max = 35, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC72"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_max = 30, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_max = 43, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_max = 53, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_def = 17, + .ppt_pl1_spl_max = 25, + .ppt_pl2_sppt_min = 15, + .ppt_pl2_sppt_def = 24, + .ppt_pl2_sppt_max = 30, + .ppt_pl3_fppt_min = 15, + .ppt_pl3_fppt_def = 30, + .ppt_pl3_fppt_max = 35, + }, + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC73XA"), + }, + .driver_data = &(struct power_data) { + .ac_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 14, + .ppt_pl2_sppt_max = 45, + .ppt_pl3_fppt_min = 19, + .ppt_pl3_fppt_max = 55, + }, + .dc_data = &(struct power_limits) { + .ppt_pl1_spl_min = 7, + .ppt_pl1_spl_def = 17, + .ppt_pl1_spl_max = 35, + .ppt_pl2_sppt_min = 13, + .ppt_pl2_sppt_def = 21, + .ppt_pl2_sppt_max = 45, + .ppt_pl3_fppt_min = 19, + .ppt_pl3_fppt_def = 26, + .ppt_pl3_fppt_max = 55, + }, + }, + }, + {} +}; + #endif /* _ASUS_ARMOURY_H_ */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index e7c95e9d29db..419491d4abca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -139,6 +139,9 @@ #define ASUS_WMI_DEVID_APU_MEM 0x000600C1 +#define ASUS_WMI_DEVID_DGPU_BASE_TGP 0x00120099 +#define ASUS_WMI_DEVID_DGPU_SET_TGP 0x00120098 + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 -- cgit v1.2.3 From 32e3fee88a4ac183541b478f5bc94084ea76436c Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 11 Nov 2025 14:11:24 +0100 Subject: platform/x86: wmi: Remove extern keyword from prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The external function definitions do not need the "extern" keyword. Remove it to silence the associated checkpatch warnings. Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20251111131125.3379-4-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 10751c8e5e6a..665ea7dc8a92 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -36,13 +36,10 @@ struct wmi_device { */ #define to_wmi_device(device) container_of_const(device, struct wmi_device, dev) -extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, - u8 instance, u32 method_id, - const struct acpi_buffer *in, - struct acpi_buffer *out); +acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, + const struct acpi_buffer *in, struct acpi_buffer *out); -extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, - u8 instance); +union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in); @@ -81,9 +78,9 @@ struct wmi_driver { */ #define to_wmi_driver(drv) container_of_const(drv, struct wmi_driver, driver) -extern int __must_check __wmi_driver_register(struct wmi_driver *driver, - struct module *owner); -extern void wmi_driver_unregister(struct wmi_driver *driver); +int __must_check __wmi_driver_register(struct wmi_driver *driver, struct module *owner); + +void wmi_driver_unregister(struct wmi_driver *driver); /** * wmi_driver_register() - Helper macro to register a WMI driver -- cgit v1.2.3 From 8bffbfdc01dff26f17f8b382266e71d48e63c5e9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 22 Oct 2025 15:51:32 +0200 Subject: reset: remove legacy reset lookup code There are no more users of this code. Let's remove the exported symbols and the implementation from reset core. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski [p.zabel@pengutronix.de: folded in 8e6ec20e-8965-4b42-99fc-0462269ff2f1@paulmck-laptop] Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 124 +-------------------------------------- include/linux/reset-controller.h | 33 ----------- 2 files changed, 3 insertions(+), 154 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 22f67fc77ae5..20cd50804ba1 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -25,9 +25,6 @@ static DEFINE_MUTEX(reset_list_mutex); static LIST_HEAD(reset_controller_list); -static DEFINE_MUTEX(reset_lookup_mutex); -static LIST_HEAD(reset_lookup_list); - /* Protects reset_gpio_lookup_list */ static DEFINE_MUTEX(reset_gpio_lookup_mutex); static LIST_HEAD(reset_gpio_lookup_list); @@ -190,33 +187,6 @@ int devm_reset_controller_register(struct device *dev, } EXPORT_SYMBOL_GPL(devm_reset_controller_register); -/** - * reset_controller_add_lookup - register a set of lookup entries - * @lookup: array of reset lookup entries - * @num_entries: number of entries in the lookup array - */ -void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries) -{ - struct reset_control_lookup *entry; - unsigned int i; - - mutex_lock(&reset_lookup_mutex); - for (i = 0; i < num_entries; i++) { - entry = &lookup[i]; - - if (!entry->dev_id || !entry->provider) { - pr_warn("%s(): reset lookup entry badly specified, skipping\n", - __func__); - continue; - } - - list_add_tail(&entry->list, &reset_lookup_list); - } - mutex_unlock(&reset_lookup_mutex); -} -EXPORT_SYMBOL_GPL(reset_controller_add_lookup); - static inline struct reset_control_array * rstc_to_array(struct reset_control *rstc) { return container_of(rstc, struct reset_control_array, base); @@ -1081,75 +1051,12 @@ out_put: } EXPORT_SYMBOL_GPL(__of_reset_control_get); -static struct reset_controller_dev * -__reset_controller_by_name(const char *name) -{ - struct reset_controller_dev *rcdev; - - lockdep_assert_held(&reset_list_mutex); - - list_for_each_entry(rcdev, &reset_controller_list, list) { - if (!rcdev->dev) - continue; - - if (!strcmp(name, dev_name(rcdev->dev))) - return rcdev; - } - - return NULL; -} - -static struct reset_control * -__reset_control_get_from_lookup(struct device *dev, const char *con_id, - enum reset_control_flags flags) -{ - bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; - const struct reset_control_lookup *lookup; - struct reset_controller_dev *rcdev; - const char *dev_id = dev_name(dev); - struct reset_control *rstc = NULL; - - mutex_lock(&reset_lookup_mutex); - - list_for_each_entry(lookup, &reset_lookup_list, list) { - if (strcmp(lookup->dev_id, dev_id)) - continue; - - if ((!con_id && !lookup->con_id) || - ((con_id && lookup->con_id) && - !strcmp(con_id, lookup->con_id))) { - mutex_lock(&reset_list_mutex); - rcdev = __reset_controller_by_name(lookup->provider); - if (!rcdev) { - mutex_unlock(&reset_list_mutex); - mutex_unlock(&reset_lookup_mutex); - /* Reset provider may not be ready yet. */ - return ERR_PTR(-EPROBE_DEFER); - } - - flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL; - - rstc = __reset_control_get_internal(rcdev, - lookup->index, - flags); - mutex_unlock(&reset_list_mutex); - break; - } - } - - mutex_unlock(&reset_lookup_mutex); - - if (!rstc) - return optional ? NULL : ERR_PTR(-ENOENT); - - return rstc; -} - struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags) { bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED; bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED; + bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; if (WARN_ON(shared && acquired)) return ERR_PTR(-EINVAL); @@ -1157,7 +1064,7 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id, if (dev->of_node) return __of_reset_control_get(dev->of_node, id, index, flags); - return __reset_control_get_from_lookup(dev, id, flags); + return optional ? NULL : ERR_PTR(-ENOENT); } EXPORT_SYMBOL_GPL(__reset_control_get); @@ -1492,31 +1399,6 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) } EXPORT_SYMBOL_GPL(devm_reset_control_array_get); -static int reset_control_get_count_from_lookup(struct device *dev) -{ - const struct reset_control_lookup *lookup; - const char *dev_id; - int count = 0; - - if (!dev) - return -EINVAL; - - dev_id = dev_name(dev); - mutex_lock(&reset_lookup_mutex); - - list_for_each_entry(lookup, &reset_lookup_list, list) { - if (!strcmp(lookup->dev_id, dev_id)) - count++; - } - - mutex_unlock(&reset_lookup_mutex); - - if (count == 0) - count = -ENOENT; - - return count; -} - /** * reset_control_get_count - Count number of resets available with a device * @@ -1530,6 +1412,6 @@ int reset_control_get_count(struct device *dev) if (dev->of_node) return of_reset_control_get_count(dev->of_node); - return reset_control_get_count_from_lookup(dev); + return -ENOENT; } EXPORT_SYMBOL_GPL(reset_control_get_count); diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 357df16ede32..46514cb1b9e0 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -26,31 +26,6 @@ struct module; struct device_node; struct of_phandle_args; -/** - * struct reset_control_lookup - represents a single lookup entry - * - * @list: internal list of all reset lookup entries - * @provider: name of the reset controller device controlling this reset line - * @index: ID of the reset controller in the reset controller device - * @dev_id: name of the device associated with this reset line - * @con_id: name of the reset line (can be NULL) - */ -struct reset_control_lookup { - struct list_head list; - const char *provider; - unsigned int index; - const char *dev_id; - const char *con_id; -}; - -#define RESET_LOOKUP(_provider, _index, _dev_id, _con_id) \ - { \ - .provider = _provider, \ - .index = _index, \ - .dev_id = _dev_id, \ - .con_id = _con_id, \ - } - /** * struct reset_controller_dev - reset controller entity that might * provide multiple reset controls @@ -90,9 +65,6 @@ void reset_controller_unregister(struct reset_controller_dev *rcdev); struct device; int devm_reset_controller_register(struct device *dev, struct reset_controller_dev *rcdev); - -void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries); #else static inline int reset_controller_register(struct reset_controller_dev *rcdev) { @@ -108,11 +80,6 @@ static inline int devm_reset_controller_register(struct device *dev, { return 0; } - -static inline void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries) -{ -} #endif #endif -- cgit v1.2.3 From f3d8b64ee46c9b4b0b82b1a4642027728bac95b8 Mon Sep 17 00:00:00 2001 From: Encrow Thorne Date: Mon, 10 Nov 2025 14:10:37 +0800 Subject: reset: fix BIT macro reference RESET_CONTROL_FLAGS_BIT_* macros use BIT(), but reset.h does not include bits.h. This causes compilation errors when including reset.h standalone. Include bits.h to make reset.h self-contained. Suggested-by: Troy Mitchell Reviewed-by: Troy Mitchell Reviewed-by: Philipp Zabel Signed-off-by: Encrow Thorne Signed-off-by: Philipp Zabel --- include/linux/reset.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 840d75d172f6..44f9e3415f92 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -2,6 +2,7 @@ #ifndef _LINUX_RESET_H_ #define _LINUX_RESET_H_ +#include #include #include #include -- cgit v1.2.3 From 4edf654be5471659e3260be0a557eaa2ece668ab Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Wed, 12 Nov 2025 16:27:06 +0000 Subject: phy: add new phy_notify_state() api Add a new phy_notify_state() api that notifies and configures a phy for a given state transition. This is intended to be used by phy drivers which need to do some runtime configuration of parameters that can't be handled by phy_calibrate() or phy_power_{on|off}(). The first usage of this API is in the Samsung UFS phy that needs to issue some register writes when entering and exiting the hibernate link state. Signed-off-by: Peter Griffin Reviewed-by: Neil Armstrong Link: https://patch.msgid.link/20251112-phy-notify-pmstate-v5-1-39df622d8fcb@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/phy-core.c | 25 +++++++++++++++++++++++++ include/linux/phy/phy.h | 19 +++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 04a5a34e7a95..60be8af984bf 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -520,6 +520,31 @@ int phy_notify_disconnect(struct phy *phy, int port) } EXPORT_SYMBOL_GPL(phy_notify_disconnect); +/** + * phy_notify_state() - phy state notification + * @phy: the PHY returned by phy_get() + * @state: the PHY state + * + * Notify the PHY of a state transition. Used to notify and + * configure the PHY accordingly. + * + * Returns: %0 if successful, a negative error code otherwise + */ +int phy_notify_state(struct phy *phy, union phy_notify state) +{ + int ret; + + if (!phy || !phy->ops->notify_phystate) + return 0; + + mutex_lock(&phy->mutex); + ret = phy->ops->notify_phystate(phy, state); + mutex_unlock(&phy->mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(phy_notify_state); + /** * phy_configure() - Changes the phy parameters * @phy: the phy returned by phy_get() diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 13add0c2c407..2af0d01ebb39 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -53,6 +53,15 @@ enum phy_media { PHY_MEDIA_DAC, }; +enum phy_ufs_state { + PHY_UFS_HIBERN8_ENTER, + PHY_UFS_HIBERN8_EXIT, +}; + +union phy_notify { + enum phy_ufs_state ufs_state; +}; + /** * union phy_configure_opts - Opaque generic phy configuration * @@ -83,6 +92,7 @@ union phy_configure_opts { * @set_speed: set the speed of the phy (optional) * @reset: resetting the phy * @calibrate: calibrate the phy + * @notify_phystate: notify and configure the phy for a particular state * @release: ops to be performed while the consumer relinquishes the PHY * @owner: the module owner containing the ops */ @@ -132,6 +142,7 @@ struct phy_ops { int (*connect)(struct phy *phy, int port); int (*disconnect)(struct phy *phy, int port); + int (*notify_phystate)(struct phy *phy, union phy_notify state); void (*release)(struct phy *phy); struct module *owner; }; @@ -255,6 +266,7 @@ int phy_reset(struct phy *phy); int phy_calibrate(struct phy *phy); int phy_notify_connect(struct phy *phy, int port); int phy_notify_disconnect(struct phy *phy, int port); +int phy_notify_state(struct phy *phy, union phy_notify state); static inline int phy_get_bus_width(struct phy *phy) { return phy->attrs.bus_width; @@ -412,6 +424,13 @@ static inline int phy_notify_disconnect(struct phy *phy, int index) return -ENOSYS; } +static inline int phy_notify_state(struct phy *phy, union phy_notify state) +{ + if (!phy) + return 0; + return -ENOSYS; +} + static inline int phy_configure(struct phy *phy, union phy_configure_opts *opts) { -- cgit v1.2.3 From 01ba82702957225218c54c06ad2c2d468b83f510 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Chundru Date: Sat, 1 Nov 2025 09:29:33 +0530 Subject: PCI: Add .assert_perst() to control PCIe PERST# Controller driver probes first, enables link training and scans the bus. When the PCI bridge is found, its child DT nodes will be scanned and pwrctrl devices will be created if needed. By the time pwrctrl driver probe gets called, link training is already enabled by controller driver. Certain devices like TC9563, which uses the PCI pwrctl framework, need to configure the device before the PCIe link is up. As the controller driver already enables link training as part of its probe, the moment device is powered on, controller and device participate in link training and link can come up immediately and may not have time to configure the device. So we need to stop the link training by using assert_perst() by asserting PERST# and de-asserting PERST# after device is configured. Signed-off-by: Krishna Chaitanya Chundru Signed-off-by: Bjorn Helgaas Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20251101-tc9563-v9-2-de3429f7787a@oss.qualcomm.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..ed5dac663e96 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -829,6 +829,7 @@ struct pci_ops { void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where); int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); + int (*assert_perst)(struct pci_bus *bus, bool assert); }; /* -- cgit v1.2.3 From d85b56af22f371409cbf667bab26f938e6528d2e Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 15 Oct 2025 17:56:30 +0200 Subject: efi: Fix trailing whitespace in header file Resolve an issue with the coding style. Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 0b9eb3d2ff97..60e994096e20 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -290,7 +290,7 @@ typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t *vendor, unsigned long *data_size, void *data); typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor); -typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, +typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); -- cgit v1.2.3 From 17029cdd8f9d0182a6499e0b7bfc6391e8463091 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 15 Oct 2025 17:56:33 +0200 Subject: efi/libstub: gop: Add support for reading EDID Add support for EFI_EDID_DISCOVERED_PROTOCOL and EFI_EDID_ACTIVE_PROTOCOL as defined in UEFI 2.8, sec 12.9. Define GUIDs and data structures in the rsp header files. In the GOP setup function, read the EDID of the primary GOP device. First try EFI_EDID_ACTIVE_PROTOCOL, which supports user-specified EDID data. Or else try EFI_EDID_DISCOVERED_PROTOCOL, which returns the display device's native EDID. If no EDID could be retrieved, clear the storage. Rename efi_setup_gop() to efi_setup_graphics() to reflect the changes Let callers pass an optional instance of struct edid_data, if they are interested. While screen_info and edid_info come from the same device handle, they should be considered indendent data. The former refers to the graphics mode, the latter refers to the display device. GOP devices might not provide both. Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/efi-stub.c | 2 +- drivers/firmware/efi/libstub/efistub.h | 31 +++++++++++++++++++++++++++- drivers/firmware/efi/libstub/gop.c | 36 ++++++++++++++++++++++++++++++++- drivers/firmware/efi/libstub/x86-stub.c | 2 +- include/linux/efi.h | 2 ++ 5 files changed, 69 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 874f63b4a383..9cb814c5ba1b 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -56,7 +56,7 @@ static struct screen_info *setup_graphics(void) { struct screen_info *si, tmp = {}; - if (efi_setup_gop(&tmp) != EFI_SUCCESS) + if (efi_setup_graphics(&tmp, NULL) != EFI_SUCCESS) return NULL; si = alloc_screen_info(); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index f5ba032863a9..b2fb0c3fa721 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -34,6 +34,9 @@ #define EFI_ALLOC_LIMIT ULONG_MAX #endif +struct edid_info; +struct screen_info; + extern bool efi_no5lvl; extern bool efi_nochunk; extern bool efi_nokaslr; @@ -578,6 +581,32 @@ union efi_graphics_output_protocol { } mixed_mode; }; +typedef union efi_edid_discovered_protocol efi_edid_discovered_protocol_t; + +union efi_edid_discovered_protocol { + struct { + u32 size_of_edid; + u8 *edid; + }; + struct { + u32 size_of_edid; + u32 edid; + } mixed_mode; +}; + +typedef union efi_edid_active_protocol efi_edid_active_protocol_t; + +union efi_edid_active_protocol { + struct { + u32 size_of_edid; + u8 *edid; + }; + struct { + u32 size_of_edid; + u32 edid; + } mixed_mode; +}; + typedef union { struct { u32 revision; @@ -1085,7 +1114,7 @@ efi_status_t efi_parse_options(char const *cmdline); void efi_parse_option_graphics(char *option); -efi_status_t efi_setup_gop(struct screen_info *si); +efi_status_t efi_setup_graphics(struct screen_info *si, struct edid_info *edid); efi_status_t handle_cmdline_files(efi_loaded_image_t *image, const efi_char16_t *optstr, diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c index 02459ef0f18c..72d74436a7a4 100644 --- a/drivers/firmware/efi/libstub/gop.c +++ b/drivers/firmware/efi/libstub/gop.c @@ -12,6 +12,7 @@ #include #include #include +#include