From 8d5976089c97a4beeeda4de59e2fba9862946893 Mon Sep 17 00:00:00 2001
From: Xiang wangx <wangxiang@cdjrlc.com>
Date: Mon, 6 Jun 2022 10:23:13 +0800
Subject: platform/chrome: cros_ec_commands: Fix syntax errors in comments

Delete the redundant word 'using'.

Signed-off-by: Xiang wangx <wangxiang@cdjrlc.com>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220606022313.22912-1-wangxiang@cdjrlc.com
---
 include/linux/platform_data/cros_ec_commands.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 8cfa8cfca77e..e59f51c41a1c 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -787,7 +787,7 @@ struct ec_host_response {
  *
  * Packets always start with a request or response header.  They are followed
  * by data_len bytes of data.  If the data_crc_present flag is set, the data
- * bytes are followed by a CRC-8 of that data, using using x^8 + x^2 + x + 1
+ * bytes are followed by a CRC-8 of that data, using x^8 + x^2 + x + 1
  * polynomial.
  *
  * Host algorithm when sending a request q:
-- 
cgit v1.2.3


From fc602b4f692cb83c937b5f79628bca32b60c4402 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Sat, 4 Jun 2022 12:32:50 +0100
Subject: mtd: spinand: Add support for ATO25D1GA

Add support for the ATO25D1GA SPI NAND flash.

Datasheet:
- https://atta.szlcsc.com/upload/public/pdf/source/20191212/C469320_04599D67B03B078044EB65FF5AEDDDE9.pdf

Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220604113250.4745-1-aidanmacdonald.0x0@gmail.com
---
 drivers/mtd/nand/spi/Makefile |  2 +-
 drivers/mtd/nand/spi/ato.c    | 86 +++++++++++++++++++++++++++++++++++++++++++
 drivers/mtd/nand/spi/core.c   |  1 +
 include/linux/mtd/spinand.h   |  1 +
 4 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/ato.c

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile
index 80dabe6ff0f3..b520fe634041 100644
--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
+spinand-objs := core.o ato.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
diff --git a/drivers/mtd/nand/spi/ato.c b/drivers/mtd/nand/spi/ato.c
new file mode 100644
index 000000000000..82b377c06812
--- /dev/null
+++ b/drivers/mtd/nand/spi/ato.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Aidan MacDonald
+ *
+ * Author: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+
+#define SPINAND_MFR_ATO		0x9b
+
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+		SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+
+static int ato25d1ga_ooblayout_ecc(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	region->offset = (16 * section) + 8;
+	region->length = 8;
+	return 0;
+}
+
+static int ato25d1ga_ooblayout_free(struct mtd_info *mtd, int section,
+				   struct mtd_oob_region *region)
+{
+	if (section > 3)
+		return -ERANGE;
+
+	if (section) {
+		region->offset = (16 * section);
+		region->length = 8;
+	} else {
+		/* first byte of section 0 is reserved for the BBM */
+		region->offset = 1;
+		region->length = 7;
+	}
+
+	return 0;
+}
+
+static const struct mtd_ooblayout_ops ato25d1ga_ooblayout = {
+	.ecc = ato25d1ga_ooblayout_ecc,
+	.free = ato25d1ga_ooblayout_free,
+};
+
+
+static const struct spinand_info ato_spinand_table[] = {
+	SPINAND_INFO("ATO25D1GA",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0x12),
+		     NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(1, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&ato25d1ga_ooblayout, NULL)),
+};
+
+static const struct spinand_manufacturer_ops ato_spinand_manuf_ops = {
+};
+
+const struct spinand_manufacturer ato_spinand_manufacturer = {
+	.id = SPINAND_MFR_ATO,
+	.name = "ATO",
+	.chips = ato_spinand_table,
+	.nchips = ARRAY_SIZE(ato_spinand_table),
+	.ops = &ato_spinand_manuf_ops,
+};
diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index d5b685d1605e..9d73910a7ae8 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -927,6 +927,7 @@ static const struct nand_ops spinand_ops = {
 };
 
 static const struct spinand_manufacturer *spinand_manufacturers[] = {
+	&ato_spinand_manufacturer,
 	&gigadevice_spinand_manufacturer,
 	&macronix_spinand_manufacturer,
 	&micron_spinand_manufacturer,
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 5584d3bb6556..6d3392a7edc6 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -260,6 +260,7 @@ struct spinand_manufacturer {
 };
 
 /* SPI NAND manufacturers */
+extern const struct spinand_manufacturer ato_spinand_manufacturer;
 extern const struct spinand_manufacturer gigadevice_spinand_manufacturer;
 extern const struct spinand_manufacturer macronix_spinand_manufacturer;
 extern const struct spinand_manufacturer micron_spinand_manufacturer;
-- 
cgit v1.2.3


From 39d649602be2ceb3f8f8e98483d09e4d71133c6a Mon Sep 17 00:00:00 2001
From: Clément Léger <clement.leger@bootlin.com>
Date: Wed, 1 Jun 2022 10:17:58 +0200
Subject: of: constify of_property_check_flags() prop argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This argument is not modified and thus can be set as const.

Signed-off-by: Clément Léger <clement.leger@bootlin.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220601081801.348571-2-clement.leger@bootlin.com
---
 include/linux/of.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index f0a5d6b10c5a..d74fd82a6963 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -207,7 +207,7 @@ static inline void of_node_clear_flag(struct device_node *n, unsigned long flag)
 }
 
 #if defined(CONFIG_OF_DYNAMIC) || defined(CONFIG_SPARC)
-static inline int of_property_check_flag(struct property *p, unsigned long flag)
+static inline int of_property_check_flag(const struct property *p, unsigned long flag)
 {
 	return test_bit(flag, &p->_flags);
 }
@@ -814,7 +814,8 @@ static inline void of_node_clear_flag(struct device_node *n, unsigned long flag)
 {
 }
 
-static inline int of_property_check_flag(struct property *p, unsigned long flag)
+static inline int of_property_check_flag(const struct property *p,
+					 unsigned long flag)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From b1d288d9c3c5ca28df062214656a59cf7ee370e0 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 6 Jun 2022 20:18:03 +0000
Subject: platform/chrome: cros_ec_proto: Rename cros_ec_command function

cros_ec_command() is the name of a function as well as a struct, as such
it can confuse indexing tools (like ctags). Avoid this by renaming it to
cros_ec_cmd(). Update all the callsites to use the new name.

This patch is a find-and-replace, so should not introduce any functional
changes.

Suggested-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Prashant Malani <pmalani@chromium.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220606201825.763788-3-pmalani@chromium.org
---
 drivers/mfd/cros_ec_dev.c                   |  4 +--
 drivers/platform/chrome/cros_ec_proto.c     | 22 ++++++++--------
 drivers/platform/chrome/cros_ec_typec.c     | 39 ++++++++++++++---------------
 drivers/platform/chrome/cros_usbpd_notify.c |  4 +--
 drivers/regulator/cros-ec-regulator.c       | 24 +++++++++---------
 include/linux/platform_data/cros_ec_proto.h |  2 +-
 6 files changed, 47 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 596731caf407..02d4271dfe06 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -250,8 +250,8 @@ static int ec_device_probe(struct platform_device *pdev)
 	 * The PCHG device cannot be detected by sending EC_FEATURE_GET_CMD, but
 	 * it can be detected by querying the number of peripheral chargers.
 	 */
-	retval = cros_ec_command(ec->ec_dev, 0, EC_CMD_PCHG_COUNT, NULL, 0,
-				 &pchg_count, sizeof(pchg_count));
+	retval = cros_ec_cmd(ec->ec_dev, 0, EC_CMD_PCHG_COUNT, NULL, 0,
+			     &pchg_count, sizeof(pchg_count));
 	if (retval >= 0 && pchg_count.port_count) {
 		retval = mfd_add_hotplug_devices(ec->dev,
 					cros_ec_pchg_cells,
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 13ced9d2dd71..b6bea183ee28 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -860,8 +860,8 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature)
 
 	if (features->flags[0] == -1U && features->flags[1] == -1U) {
 		/* features bitmap not read yet */
-		ret = cros_ec_command(ec->ec_dev, 0, EC_CMD_GET_FEATURES + ec->cmd_offset,
-				      NULL, 0, features, sizeof(*features));
+		ret = cros_ec_cmd(ec->ec_dev, 0, EC_CMD_GET_FEATURES + ec->cmd_offset,
+				  NULL, 0, features, sizeof(*features));
 		if (ret < 0) {
 			dev_warn(ec->dev, "cannot get EC features: %d\n", ret);
 			memset(features, 0, sizeof(*features));
@@ -942,7 +942,7 @@ int cros_ec_get_sensor_count(struct cros_ec_dev *ec)
 EXPORT_SYMBOL_GPL(cros_ec_get_sensor_count);
 
 /**
- * cros_ec_command - Send a command to the EC.
+ * cros_ec_cmd - Send a command to the EC.
  *
  * @ec_dev: EC device
  * @version: EC command version
@@ -954,13 +954,13 @@ EXPORT_SYMBOL_GPL(cros_ec_get_sensor_count);
  *
  * Return: >= 0 on success, negative error number on failure.
  */
-int cros_ec_command(struct cros_ec_device *ec_dev,
-		    unsigned int version,
-		    int command,
-		    void *outdata,
-		    int outsize,
-		    void *indata,
-		    int insize)
+int cros_ec_cmd(struct cros_ec_device *ec_dev,
+		unsigned int version,
+		int command,
+		void *outdata,
+		int outsize,
+		void *indata,
+		int insize)
 {
 	struct cros_ec_command *msg;
 	int ret;
@@ -987,4 +987,4 @@ error:
 	kfree(msg);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(cros_ec_command);
+EXPORT_SYMBOL_GPL(cros_ec_cmd);
diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c
index 7cb2e35c4ded..d6088ba447af 100644
--- a/drivers/platform/chrome/cros_ec_typec.c
+++ b/drivers/platform/chrome/cros_ec_typec.c
@@ -525,8 +525,8 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num,
 	enum typec_orientation orientation;
 	int ret;
 
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_USB_PD_MUX_INFO,
-			      &req, sizeof(req), &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_USB_PD_MUX_INFO,
+			  &req, sizeof(req), &resp, sizeof(resp));
 	if (ret < 0) {
 		dev_warn(typec->dev, "Failed to get mux info for port: %d, err = %d\n",
 			 port_num, ret);
@@ -585,8 +585,8 @@ mux_ack:
 	/* Sending Acknowledgment to EC */
 	mux_ack.port = port_num;
 
-	if (cros_ec_command(typec->ec, 0, EC_CMD_USB_PD_MUX_ACK, &mux_ack,
-			    sizeof(mux_ack), NULL, 0) < 0)
+	if (cros_ec_cmd(typec->ec, 0, EC_CMD_USB_PD_MUX_ACK, &mux_ack,
+			sizeof(mux_ack), NULL, 0) < 0)
 		dev_warn(typec->dev,
 			 "Failed to send Mux ACK to EC for port: %d\n",
 			 port_num);
@@ -754,8 +754,8 @@ static int cros_typec_handle_sop_prime_disc(struct cros_typec_data *typec, int p
 	int ret = 0;
 
 	memset(disc, 0, EC_PROTO2_MAX_RESPONSE_SIZE);
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
-			      disc, EC_PROTO2_MAX_RESPONSE_SIZE);
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
+			  disc, EC_PROTO2_MAX_RESPONSE_SIZE);
 	if (ret < 0) {
 		dev_err(typec->dev, "Failed to get SOP' discovery data for port: %d\n", port_num);
 		goto sop_prime_disc_exit;
@@ -837,8 +837,8 @@ static int cros_typec_handle_sop_disc(struct cros_typec_data *typec, int port_nu
 	typec_partner_set_pd_revision(port->partner, pd_revision);
 
 	memset(sop_disc, 0, EC_PROTO2_MAX_RESPONSE_SIZE);
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
-			      sop_disc, EC_PROTO2_MAX_RESPONSE_SIZE);
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req),
+			  sop_disc, EC_PROTO2_MAX_RESPONSE_SIZE);
 	if (ret < 0) {
 		dev_err(typec->dev, "Failed to get SOP discovery data for port: %d\n", port_num);
 		goto disc_exit;
@@ -870,8 +870,8 @@ static int cros_typec_send_clear_event(struct cros_typec_data *typec, int port_n
 		.clear_events_mask = events_mask,
 	};
 
-	return cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_CONTROL, &req,
-			       sizeof(req), NULL, 0);
+	return cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_CONTROL, &req,
+			   sizeof(req), NULL, 0);
 }
 
 static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num)
@@ -882,8 +882,8 @@ static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num
 	};
 	int ret;
 
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_TYPEC_STATUS, &req, sizeof(req),
-			      &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_TYPEC_STATUS, &req, sizeof(req),
+			  &resp, sizeof(resp));
 	if (ret < 0) {
 		dev_warn(typec->dev, "EC_CMD_TYPEC_STATUS failed for port: %d\n", port_num);
 		return;
@@ -960,9 +960,9 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num)
 	req.mux = USB_PD_CTRL_MUX_NO_CHANGE;
 	req.swap = USB_PD_CTRL_SWAP_NONE;
 
-	ret = cros_ec_command(typec->ec, typec->pd_ctrl_ver,
-			      EC_CMD_USB_PD_CONTROL, &req, sizeof(req),
-			      &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, typec->pd_ctrl_ver,
+			  EC_CMD_USB_PD_CONTROL, &req, sizeof(req),
+			  &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
@@ -997,9 +997,8 @@ static int cros_typec_get_cmd_version(struct cros_typec_data *typec)
 
 	/* We're interested in the PD control command version. */
 	req_v1.cmd = EC_CMD_USB_PD_CONTROL;
-	ret = cros_ec_command(typec->ec, 1, EC_CMD_GET_CMD_VERSIONS,
-			      &req_v1, sizeof(req_v1), &resp,
-				    sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 1, EC_CMD_GET_CMD_VERSIONS,
+			  &req_v1, sizeof(req_v1), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
@@ -1090,8 +1089,8 @@ static int cros_typec_probe(struct platform_device *pdev)
 	typec->typec_cmd_supported = cros_ec_check_features(ec_dev, EC_FEATURE_TYPEC_CMD);
 	typec->needs_mux_ack = cros_ec_check_features(ec_dev, EC_FEATURE_TYPEC_MUX_REQUIRE_AP_ACK);
 
-	ret = cros_ec_command(typec->ec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
-			      &resp, sizeof(resp));
+	ret = cros_ec_cmd(typec->ec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
+			  &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
diff --git a/drivers/platform/chrome/cros_usbpd_notify.c b/drivers/platform/chrome/cros_usbpd_notify.c
index 91ce6be91aac..4b5a81c9dc6d 100644
--- a/drivers/platform/chrome/cros_usbpd_notify.c
+++ b/drivers/platform/chrome/cros_usbpd_notify.c
@@ -71,8 +71,8 @@ static void cros_usbpd_get_event_and_notify(struct device  *dev,
 	}
 
 	/* Check for PD host events on EC. */
-	ret = cros_ec_command(ec_dev, 0, EC_CMD_PD_HOST_EVENT_STATUS,
-			      NULL, 0, &host_event_status, sizeof(host_event_status));
+	ret = cros_ec_cmd(ec_dev, 0, EC_CMD_PD_HOST_EVENT_STATUS,
+			  NULL, 0, &host_event_status, sizeof(host_event_status));
 	if (ret < 0) {
 		dev_warn(dev, "Can't get host event status (err: %d)\n", ret);
 		goto send_notify;
diff --git a/drivers/regulator/cros-ec-regulator.c b/drivers/regulator/cros-ec-regulator.c
index 1c5fc74a4776..1591636f86c3 100644
--- a/drivers/regulator/cros-ec-regulator.c
+++ b/drivers/regulator/cros-ec-regulator.c
@@ -30,8 +30,8 @@ static int cros_ec_regulator_enable(struct regulator_dev *dev)
 		.enable = 1,
 	};
 
-	return cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
-			       sizeof(cmd), NULL, 0);
+	return cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
+			   sizeof(cmd), NULL, 0);
 }
 
 static int cros_ec_regulator_disable(struct regulator_dev *dev)
@@ -42,8 +42,8 @@ static int cros_ec_regulator_disable(struct regulator_dev *dev)
 		.enable = 0,
 	};
 
-	return cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
-			       sizeof(cmd), NULL, 0);
+	return cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_ENABLE, &cmd,
+			   sizeof(cmd), NULL, 0);
 }
 
 static int cros_ec_regulator_is_enabled(struct regulator_dev *dev)
@@ -55,8 +55,8 @@ static int cros_ec_regulator_is_enabled(struct regulator_dev *dev)
 	struct ec_response_regulator_is_enabled resp;
 	int ret;
 
-	ret = cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_IS_ENABLED, &cmd,
-			      sizeof(cmd), &resp, sizeof(resp));
+	ret = cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_IS_ENABLED, &cmd,
+			  sizeof(cmd), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 	return resp.enabled;
@@ -82,8 +82,8 @@ static int cros_ec_regulator_get_voltage(struct regulator_dev *dev)
 	struct ec_response_regulator_get_voltage resp;
 	int ret;
 
-	ret = cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_GET_VOLTAGE, &cmd,
-			      sizeof(cmd), &resp, sizeof(resp));
+	ret = cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_GET_VOLTAGE, &cmd,
+			  sizeof(cmd), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 	return resp.voltage_mv * 1000;
@@ -108,8 +108,8 @@ static int cros_ec_regulator_set_voltage(struct regulator_dev *dev, int min_uV,
 	if (min_mV > max_mV)
 		return -EINVAL;
 
-	return cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_SET_VOLTAGE, &cmd,
-			       sizeof(cmd), NULL, 0);
+	return cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_SET_VOLTAGE, &cmd,
+			   sizeof(cmd), NULL, 0);
 }
 
 static const struct regulator_ops cros_ec_regulator_voltage_ops = {
@@ -130,8 +130,8 @@ static int cros_ec_regulator_init_info(struct device *dev,
 	struct ec_response_regulator_get_info resp;
 	int ret;
 
-	ret = cros_ec_command(data->ec_dev, 0, EC_CMD_REGULATOR_GET_INFO, &cmd,
-			      sizeof(cmd), &resp, sizeof(resp));
+	ret = cros_ec_cmd(data->ec_dev, 0, EC_CMD_REGULATOR_GET_INFO, &cmd,
+			  sizeof(cmd), &resp, sizeof(resp));
 	if (ret < 0)
 		return ret;
 
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 138fd912c808..816da4eef3e5 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -231,7 +231,7 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
-int cros_ec_command(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
+int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
 		    int outsize, void *indata, int insize);
 
 /**
-- 
cgit v1.2.3


From f87e15fbf6d8cdb51f953338d41a4a52ad1aca14 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Mon, 6 Jun 2022 20:18:05 +0000
Subject: platform/chrome: cros_ec_proto: Update size arg types

cros_ec_cmd() takes 2 size arguments. Update them to be of the more
appropriate type size_t.

Suggested-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Prashant Malani <pmalani@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220606201825.763788-4-pmalani@chromium.org
---
 drivers/platform/chrome/cros_ec_proto.c     | 4 ++--
 include/linux/platform_data/cros_ec_proto.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index b6bea183ee28..cefabfe45551 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -958,9 +958,9 @@ int cros_ec_cmd(struct cros_ec_device *ec_dev,
 		unsigned int version,
 		int command,
 		void *outdata,
-		int outsize,
+		size_t outsize,
 		void *indata,
-		int insize)
+		size_t insize)
 {
 	struct cros_ec_command *msg;
 	int ret;
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 816da4eef3e5..85e29300f63d 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -232,7 +232,7 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature);
 int cros_ec_get_sensor_count(struct cros_ec_dev *ec);
 
 int cros_ec_cmd(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata,
-		    int outsize, void *indata, int insize);
+		    size_t outsize, void *indata, size_t insize);
 
 /**
  * cros_ec_get_time_ns() - Return time in ns.
-- 
cgit v1.2.3


From 0c90466a7985d39355f743e9cd2139da3e86c4d8 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 3 Jun 2022 23:07:45 +0200
Subject: mtd: hyperbus: Make hyperbus_unregister_device() return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only thing that could theoretically fail in that function is
mtd_device_unregister(). However it's not supposed to fail and when
used correctly it doesn't. So wail loudly if it does anyhow.

This matches how other drivers (e.g. nand/raw/nandsim.c) use
mtd_device_unregister().

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220603210758.148493-2-u.kleine-koenig@pengutronix.de
---
 drivers/mtd/hyperbus/hbmc-am654.c    | 6 +++---
 drivers/mtd/hyperbus/hyperbus-core.c | 8 ++------
 drivers/mtd/hyperbus/rpc-if.c        | 5 +++--
 include/linux/mtd/hyperbus.h         | 4 +---
 4 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/hyperbus/hbmc-am654.c b/drivers/mtd/hyperbus/hbmc-am654.c
index a3439b791eeb..a6161ce340d4 100644
--- a/drivers/mtd/hyperbus/hbmc-am654.c
+++ b/drivers/mtd/hyperbus/hbmc-am654.c
@@ -233,16 +233,16 @@ static int am654_hbmc_remove(struct platform_device *pdev)
 {
 	struct am654_hbmc_priv *priv = platform_get_drvdata(pdev);
 	struct am654_hbmc_device_priv *dev_priv = priv->hbdev.priv;
-	int ret;
 
-	ret = hyperbus_unregister_device(&priv->hbdev);
+	hyperbus_unregister_device(&priv->hbdev);
+
 	if (priv->mux_ctrl)
 		mux_control_deselect(priv->mux_ctrl);
 
 	if (dev_priv->rx_chan)
 		dma_release_channel(dev_priv->rx_chan);
 
-	return ret;
+	return 0;
 }
 
 static const struct of_device_id am654_hbmc_dt_ids[] = {
diff --git a/drivers/mtd/hyperbus/hyperbus-core.c b/drivers/mtd/hyperbus/hyperbus-core.c
index 2f9fc4e17d53..4d8047d43e48 100644
--- a/drivers/mtd/hyperbus/hyperbus-core.c
+++ b/drivers/mtd/hyperbus/hyperbus-core.c
@@ -126,16 +126,12 @@ int hyperbus_register_device(struct hyperbus_device *hbdev)
 }
 EXPORT_SYMBOL_GPL(hyperbus_register_device);
 
-int hyperbus_unregister_device(struct hyperbus_device *hbdev)
+void hyperbus_unregister_device(struct hyperbus_device *hbdev)
 {
-	int ret = 0;
-
 	if (hbdev && hbdev->mtd) {
-		ret = mtd_device_unregister(hbdev->mtd);
+		WARN_ON(mtd_device_unregister(hbdev->mtd));
 		map_destroy(hbdev->mtd);
 	}
-
-	return ret;
 }
 EXPORT_SYMBOL_GPL(hyperbus_unregister_device);
 
diff --git a/drivers/mtd/hyperbus/rpc-if.c b/drivers/mtd/hyperbus/rpc-if.c
index 6e08ec1d4f09..15a0be63ede1 100644
--- a/drivers/mtd/hyperbus/rpc-if.c
+++ b/drivers/mtd/hyperbus/rpc-if.c
@@ -153,11 +153,12 @@ static int rpcif_hb_probe(struct platform_device *pdev)
 static int rpcif_hb_remove(struct platform_device *pdev)
 {
 	struct rpcif_hyperbus *hyperbus = platform_get_drvdata(pdev);
-	int error = hyperbus_unregister_device(&hyperbus->hbdev);
+
+	hyperbus_unregister_device(&hyperbus->hbdev);
 
 	rpcif_disable_rpm(&hyperbus->rpc);
 
-	return error;
+	return 0;
 }
 
 static struct platform_driver rpcif_platform_driver = {
diff --git a/include/linux/mtd/hyperbus.h b/include/linux/mtd/hyperbus.h
index 0ce612428aea..bb6b7121a542 100644
--- a/include/linux/mtd/hyperbus.h
+++ b/include/linux/mtd/hyperbus.h
@@ -89,9 +89,7 @@ int hyperbus_register_device(struct hyperbus_device *hbdev);
 /**
  * hyperbus_unregister_device - deregister HyperBus slave memory device
  * @hbdev: hyperbus_device to be unregistered
- *
- * Return: 0 for success, others for failure.
  */
-int hyperbus_unregister_device(struct hyperbus_device *hbdev);
+void hyperbus_unregister_device(struct hyperbus_device *hbdev);
 
 #endif /* __LINUX_MTD_HYPERBUS_H__ */
-- 
cgit v1.2.3


From ea7f0f777d28db6e500a05836f2a9d467c7012de Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Thu, 9 Jun 2022 08:49:37 +0000
Subject: platform/chrome: cros_ec_commands: fix compile errors

Fix compile errors when including cros_ec_commands.h solely.

1.
cros_ec_commands.h:587:9: error: unknown type name 'uint8_t'
  587 |         uint8_t flags;
      |         ^~~~~~~

2.
cros_ec_commands.h:1105:43: error: implicit declaration of function 'BIT'
 1105 |         EC_COMMS_STATUS_PROCESSING      = BIT(0),
      |                                           ^~~

Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220609084957.3684698-2-tzungbi@kernel.org
---
 include/linux/platform_data/cros_ec_commands.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index e59f51c41a1c..f13568b3e247 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -13,8 +13,8 @@
 #ifndef __CROS_EC_COMMANDS_H
 #define __CROS_EC_COMMANDS_H
 
-
-
+#include <linux/bits.h>
+#include <linux/types.h>
 
 #define BUILD_ASSERT(_cond)
 
-- 
cgit v1.2.3


From 3db0c9e5de7bd9dbe52580eb9752b2b3049e38da Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Thu, 9 Jun 2022 08:49:39 +0000
Subject: platform/chrome: use macros for passthru indexes

Move passthru indexes for EC and PD devices to common header.  Also use
them instead of literal constants.

Reviewed-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://lore.kernel.org/r/20220609084957.3684698-4-tzungbi@kernel.org
---
 drivers/platform/chrome/cros_ec.c            |  3 ---
 drivers/platform/chrome/cros_ec_proto.c      |  6 +++---
 drivers/platform/chrome/cros_ec_proto_test.c | 15 ++++++++++-----
 drivers/platform/chrome/cros_ec_trace.h      |  8 ++++----
 include/linux/platform_data/cros_ec_proto.h  |  3 +++
 5 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c
index b3e94cdf7d1a..e51a3f2176c7 100644
--- a/drivers/platform/chrome/cros_ec.c
+++ b/drivers/platform/chrome/cros_ec.c
@@ -19,9 +19,6 @@
 
 #include "cros_ec.h"
 
-#define CROS_EC_DEV_EC_INDEX 0
-#define CROS_EC_DEV_PD_INDEX 1
-
 static struct cros_ec_platform ec_p = {
 	.ec_name = CROS_EC_DEV_NAME,
 	.cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_EC_INDEX),
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index cefabfe45551..cfa3dacce4e5 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -433,7 +433,7 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
 
 	/* First try sending with proto v3. */
 	ec_dev->proto_version = 3;
-	ret = cros_ec_host_command_proto_query(ec_dev, 0, proto_msg);
+	ret = cros_ec_host_command_proto_query(ec_dev, CROS_EC_DEV_EC_INDEX, proto_msg);
 
 	if (ret == 0) {
 		proto_info = (struct ec_response_get_protocol_info *)
@@ -459,7 +459,7 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
 		/*
 		 * Check for PD
 		 */
-		ret = cros_ec_host_command_proto_query(ec_dev, 1, proto_msg);
+		ret = cros_ec_host_command_proto_query(ec_dev, CROS_EC_DEV_PD_INDEX, proto_msg);
 
 		if (ret) {
 			dev_dbg(ec_dev->dev, "no PD chip found: %d\n", ret);
@@ -609,7 +609,7 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, struct cros_ec_command *msg)
 		msg->insize = ec_dev->max_response;
 	}
 
-	if (msg->command < EC_CMD_PASSTHRU_OFFSET(1)) {
+	if (msg->command < EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX)) {
 		if (msg->outsize > ec_dev->max_request) {
 			dev_err(ec_dev->dev,
 				"request of size %u is too big (max: %u)\n",
diff --git a/drivers/platform/chrome/cros_ec_proto_test.c b/drivers/platform/chrome/cros_ec_proto_test.c
index 675306c16d47..f8dbfb0d8dc8 100644
--- a/drivers/platform/chrome/cros_ec_proto_test.c
+++ b/drivers/platform/chrome/cros_ec_proto_test.c
@@ -281,7 +281,8 @@ static void cros_ec_proto_test_query_all_normal(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -396,7 +397,8 @@ static void cros_ec_proto_test_query_all_no_pd_return_error(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -685,7 +687,8 @@ static void cros_ec_proto_test_query_all_no_mkbp(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -783,7 +786,8 @@ static void cros_ec_proto_test_query_all_no_host_sleep(struct kunit *test)
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
@@ -889,7 +893,8 @@ static void cros_ec_proto_test_query_all_default_wake_mask_return_error(struct k
 
 		KUNIT_EXPECT_EQ(test, mock->msg.version, 0);
 		KUNIT_EXPECT_EQ(test, mock->msg.command,
-				EC_CMD_PASSTHRU_OFFSET(1) | EC_CMD_GET_PROTOCOL_INFO);
+				EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX) |
+				EC_CMD_GET_PROTOCOL_INFO);
 		KUNIT_EXPECT_EQ(test, mock->msg.insize,
 				sizeof(struct ec_response_get_protocol_info));
 		KUNIT_EXPECT_EQ(test, mock->msg.outsize, 0);
diff --git a/drivers/platform/chrome/cros_ec_trace.h b/drivers/platform/chrome/cros_ec_trace.h
index 9bb5cd2c98b8..d7e407de88df 100644
--- a/drivers/platform/chrome/cros_ec_trace.h
+++ b/drivers/platform/chrome/cros_ec_trace.h
@@ -30,8 +30,8 @@ TRACE_EVENT(cros_ec_request_start,
 	),
 	TP_fast_assign(
 		__entry->version = cmd->version;
-		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(1);
-		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(1);
+		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
+		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
 		__entry->outsize = cmd->outsize;
 		__entry->insize = cmd->insize;
 	),
@@ -55,8 +55,8 @@ TRACE_EVENT(cros_ec_request_done,
 	),
 	TP_fast_assign(
 		__entry->version = cmd->version;
-		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(1);
-		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(1);
+		__entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
+		__entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX);
 		__entry->outsize = cmd->outsize;
 		__entry->insize = cmd->insize;
 		__entry->result = cmd->result;
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 85e29300f63d..c82a8285d936 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -21,6 +21,9 @@
 #define CROS_EC_DEV_SCP_NAME	"cros_scp"
 #define CROS_EC_DEV_TP_NAME	"cros_tp"
 
+#define CROS_EC_DEV_EC_INDEX 0
+#define CROS_EC_DEV_PD_INDEX 1
+
 /*
  * The EC is unresponsive for a time after a reboot command.  Add a
  * simple delay to make sure that the bus stays locked.
-- 
cgit v1.2.3


From 4173f018aae16b6496d292c234b858241f85254f Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:12 +0200
Subject: tty/vt: consolemap: rename and document struct uni_pagedir

struct uni_pagedir contains 32 unicode page directories, so the name of
the structure is a bit misleading. Rename the structure to uni_pagedict,
so it looks like this:
struct uni_pagedict
  -> 32 page dirs
     -> 32 rows
       -> 64 glyphs

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-2-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/consolemap.c    | 47 +++++++++++++++++++++++++-----------------
 drivers/video/console/vgacon.c |  4 ++--
 include/linux/console_struct.h |  6 +++---
 3 files changed, 33 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 839d75d1a6c0..5acafeea9afc 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -186,17 +186,26 @@ static unsigned short translations[][256] = {
 
 static int inv_translate[MAX_NR_CONSOLES];
 
-struct uni_pagedir {
-	u16 		**uni_pgdir[32];
+/**
+ * struct uni_pagedict -- unicode directory
+ *
+ * @uni_pgdir: 32*32*64 table with glyphs
+ * @refcount: reference count of this structure
+ * @sum: checksum
+ * @inverse_translations: best-effort inverse mapping
+ * @inverse_trans_unicode: best-effort inverse mapping to unicode
+ */
+struct uni_pagedict {
+	u16		**uni_pgdir[32];
 	unsigned long	refcount;
 	unsigned long	sum;
 	unsigned char	*inverse_translations[4];
 	u16		*inverse_trans_unicode;
 };
 
-static struct uni_pagedir *dflt;
+static struct uni_pagedict *dflt;
 
-static void set_inverse_transl(struct vc_data *conp, struct uni_pagedir *p, int i)
+static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *p, int i)
 {
 	int j, glyph;
 	unsigned short *t = translations[i];
@@ -221,7 +230,7 @@ static void set_inverse_transl(struct vc_data *conp, struct uni_pagedir *p, int
 }
 
 static void set_inverse_trans_unicode(struct vc_data *conp,
-				      struct uni_pagedir *p)
+				      struct uni_pagedict *p)
 {
 	int i, j, k, glyph;
 	u16 **p1, *p2;
@@ -270,7 +279,7 @@ unsigned short *set_translate(int m, struct vc_data *vc)
  */
 u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode)
 {
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 	int m;
 	if (glyph < 0 || glyph >= MAX_GLYPH)
 		return 0;
@@ -297,7 +306,7 @@ EXPORT_SYMBOL_GPL(inverse_translate);
 static void update_user_maps(void)
 {
 	int i;
-	struct uni_pagedir *p, *q = NULL;
+	struct uni_pagedict *p, *q = NULL;
 	
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (!vc_cons_allocated(i))
@@ -393,7 +402,7 @@ int con_get_trans_new(ushort __user * arg)
 extern u8 dfont_unicount[];	/* Defined in console_defmap.c */
 extern u16 dfont_unitable[];
 
-static void con_release_unimap(struct uni_pagedir *p)
+static void con_release_unimap(struct uni_pagedict *p)
 {
 	u16 **p1;
 	int i, j;
@@ -419,7 +428,7 @@ static void con_release_unimap(struct uni_pagedir *p)
 /* Caller must hold the console lock */
 void con_free_unimap(struct vc_data *vc)
 {
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 
 	p = *vc->vc_uni_pagedir_loc;
 	if (!p)
@@ -431,10 +440,10 @@ void con_free_unimap(struct vc_data *vc)
 	kfree(p);
 }
   
-static int con_unify_unimap(struct vc_data *conp, struct uni_pagedir *p)
+static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *p)
 {
 	int i, j, k;
-	struct uni_pagedir *q;
+	struct uni_pagedict *q;
 	
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (!vc_cons_allocated(i))
@@ -472,7 +481,7 @@ static int con_unify_unimap(struct vc_data *conp, struct uni_pagedir *p)
 }
 
 static int
-con_insert_unipair(struct uni_pagedir *p, u_short unicode, u_short fontpos)
+con_insert_unipair(struct uni_pagedict *p, u_short unicode, u_short fontpos)
 {
 	int i, n;
 	u16 **p1, *p2;
@@ -503,7 +512,7 @@ con_insert_unipair(struct uni_pagedir *p, u_short unicode, u_short fontpos)
 /* Caller must hold the lock */
 static int con_do_clear_unimap(struct vc_data *vc)
 {
-	struct uni_pagedir *p, *q;
+	struct uni_pagedict *p, *q;
 
 	p = *vc->vc_uni_pagedir_loc;
 	if (!p || --p->refcount) {
@@ -536,7 +545,7 @@ int con_clear_unimap(struct vc_data *vc)
 int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 {
 	int err = 0, err1, i;
-	struct uni_pagedir *p, *q;
+	struct uni_pagedict *p, *q;
 	struct unipair *unilist, *plist;
 
 	if (!ct)
@@ -569,7 +578,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 		
 		/*
 		 * Since refcount was > 1, con_clear_unimap() allocated a
-		 * a new uni_pagedir for this vc.  Re: p != q
+		 * a new uni_pagedict for this vc.  Re: p != q
 		 */
 		q = *vc->vc_uni_pagedir_loc;
 
@@ -660,7 +669,7 @@ int con_set_default_unimap(struct vc_data *vc)
 {
 	int i, j, err = 0, err1;
 	u16 *q;
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 
 	if (dflt) {
 		p = *vc->vc_uni_pagedir_loc;
@@ -714,7 +723,7 @@ EXPORT_SYMBOL(con_set_default_unimap);
  */
 int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc)
 {
-	struct uni_pagedir *q;
+	struct uni_pagedict *q;
 
 	if (!*src_vc->vc_uni_pagedir_loc)
 		return -EINVAL;
@@ -739,7 +748,7 @@ int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct, struct uni
 	int i, j, k, ret = 0;
 	ushort ect;
 	u16 **p1, *p2;
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 	struct unipair *unilist;
 
 	unilist = kvmalloc_array(ct, sizeof(struct unipair), GFP_KERNEL);
@@ -810,7 +819,7 @@ conv_uni_to_pc(struct vc_data *conp, long ucs)
 {
 	int h;
 	u16 **p1, *p2;
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
   
 	/* Only 16-bit codes supported at this time */
 	if (ucs > 0xffff)
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 576612f18d59..058a78b8dbcf 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -75,7 +75,7 @@ static void vgacon_scrolldelta(struct vc_data *c, int lines);
 static int vgacon_set_origin(struct vc_data *c);
 static void vgacon_save_screen(struct vc_data *c);
 static void vgacon_invert_region(struct vc_data *c, u16 * p, int count);
-static struct uni_pagedir *vgacon_uni_pagedir;
+static struct uni_pagedict *vgacon_uni_pagedir;
 static int vgacon_refcount;
 
 /* Description of the hardware situation */
@@ -342,7 +342,7 @@ static const char *vgacon_startup(void)
 
 static void vgacon_init(struct vc_data *c, int init)
 {
-	struct uni_pagedir *p;
+	struct uni_pagedict *p;
 
 	/*
 	 * We cannot be loaded as a module, therefore init will be 1
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index d5b9c8d40c18..f75033f0277f 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -17,7 +17,7 @@
 #include <linux/vt.h>
 #include <linux/workqueue.h>
 
-struct uni_pagedir;
+struct uni_pagedict;
 struct uni_screen;
 
 #define NPAR 16
@@ -157,8 +157,8 @@ struct vc_data {
 	unsigned int	vc_bell_duration;	/* Console bell duration */
 	unsigned short	vc_cur_blink_ms;	/* Cursor blink duration */
 	struct vc_data **vc_display_fg;		/* [!] Ptr to var holding fg console for this display */
-	struct uni_pagedir *vc_uni_pagedir;
-	struct uni_pagedir **vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */
+	struct uni_pagedict *vc_uni_pagedir;
+	struct uni_pagedict **vc_uni_pagedir_loc; /* [!] Location of uni_pagedict variable for this console */
 	struct uni_screen *vc_uni_screen;	/* unicode screen content */
 	/* additional information is in vt_kern.h */
 };
-- 
cgit v1.2.3


From 0b75f7968d61566d150747512344cabaa59e54c6 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:15 +0200
Subject: tty/vt: consolemap: remove extern from function decls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The extern keyword is not needed for function declarations. Remove it,
so that the consolemap header conforms to other tty headers.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/consolemap.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index bcfce748c9d8..abc28e4450bf 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -17,12 +17,11 @@
 #ifdef CONFIG_CONSOLE_TRANSLATIONS
 struct vc_data;
 
-extern u16 inverse_translate(const struct vc_data *conp, int glyph,
-		int use_unicode);
-extern unsigned short *set_translate(int m, struct vc_data *vc);
-extern int conv_uni_to_pc(struct vc_data *conp, long ucs);
-extern u32 conv_8bit_to_uni(unsigned char c);
-extern int conv_uni_to_8bit(u32 uni);
+u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode);
+unsigned short *set_translate(int m, struct vc_data *vc);
+int conv_uni_to_pc(struct vc_data *conp, long ucs);
+u32 conv_8bit_to_uni(unsigned char c);
+int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 #else
 #define inverse_translate(conp, glyph, uni) ((uint16_t)glyph)
-- 
cgit v1.2.3


From f827c754f9b6247f4d4f9cbb508b22bff2cf8e6d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:16 +0200
Subject: tty/vt: consolemap: convert macros to static inlines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit changes !CONFIG_CONSOLE_TRANSLATIONS definitions to real
(inline) functions. So the commit:
* makes type checking much stronger,
* removes the need of many parentheses and casts, and
* makes the code more readable.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-6-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/consolemap.h | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index abc28e4450bf..98171dbed51f 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -14,9 +14,9 @@
 
 #include <linux/types.h>
 
-#ifdef CONFIG_CONSOLE_TRANSLATIONS
 struct vc_data;
 
+#ifdef CONFIG_CONSOLE_TRANSLATIONS
 u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode);
 unsigned short *set_translate(int m, struct vc_data *vc);
 int conv_uni_to_pc(struct vc_data *conp, long ucs);
@@ -24,12 +24,33 @@ u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 #else
-#define inverse_translate(conp, glyph, uni) ((uint16_t)glyph)
-#define set_translate(m, vc) ((unsigned short *)NULL)
-#define conv_uni_to_pc(conp, ucs) ((int) (ucs > 0xff ? -1: ucs))
-#define conv_8bit_to_uni(c) ((uint32_t)(c))
-#define conv_uni_to_8bit(c) ((int) ((c) & 0xff))
-#define console_map_init(c) do { ; } while (0)
+static inline u16 inverse_translate(const struct vc_data *conp, int glyph,
+		int use_unicode)
+{
+	return glyph;
+}
+
+static inline unsigned short *set_translate(int m, struct vc_data *vc)
+{
+	return NULL;
+}
+
+static inline int conv_uni_to_pc(struct vc_data *conp, long ucs)
+{
+	return ucs > 0xff ? -1 : ucs;
+}
+
+static inline u32 conv_8bit_to_uni(unsigned char c)
+{
+	return c;
+}
+
+static inline int conv_uni_to_8bit(u32 uni)
+{
+	return uni & 0xff;
+}
+
+static inline void console_map_init(void) { }
 #endif /* CONFIG_CONSOLE_TRANSLATIONS */
 
 #endif /* __LINUX_CONSOLEMAP_H__ */
-- 
cgit v1.2.3


From d9ebb906a45adc71a62aceb1e3608c847cafa660 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:17 +0200
Subject: tty/vt: consolemap: make parameters of inverse_translate() saner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- int use_unicode -> bool: it's used as bool at some places already, so
  make it explicit.
- int glyph -> u16: every caller passes a u16 in. So make it explicit
  too. And remove a negative check from inverse_translate() as it never
  could be negative.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-7-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/accessibility/braille/braille_console.c | 2 +-
 drivers/accessibility/speakup/main.c            | 2 +-
 drivers/tty/vt/consolemap.c                     | 4 ++--
 drivers/tty/vt/selection.c                      | 3 ++-
 drivers/tty/vt/vt.c                             | 2 +-
 include/linux/consolemap.h                      | 6 +++---
 6 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/accessibility/braille/braille_console.c b/drivers/accessibility/braille/braille_console.c
index fdc6b593f500..c4d54a5326b1 100644
--- a/drivers/accessibility/braille/braille_console.c
+++ b/drivers/accessibility/braille/braille_console.c
@@ -131,7 +131,7 @@ static void vc_refresh(struct vc_data *vc)
 	for (i = 0; i < WIDTH; i++) {
 		u16 glyph = screen_glyph(vc,
 				2 * (vc_x + i) + vc_y * vc->vc_size_row);
-		buf[i] = inverse_translate(vc, glyph, 1);
+		buf[i] = inverse_translate(vc, glyph, true);
 	}
 	braille_write(buf);
 }
diff --git a/drivers/accessibility/speakup/main.c b/drivers/accessibility/speakup/main.c
index d726537fa16c..f52265293482 100644
--- a/drivers/accessibility/speakup/main.c
+++ b/drivers/accessibility/speakup/main.c
@@ -470,7 +470,7 @@ static u16 get_char(struct vc_data *vc, u16 *pos, u_char *attribs)
 			c |= 0x100;
 		}
 
-		ch = inverse_translate(vc, c, 1);
+		ch = inverse_translate(vc, c, true);
 		*attribs = (w & 0xff00) >> 8;
 	}
 	return ch;
diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index fb61158f4dc6..157c7f936294 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -281,12 +281,12 @@ unsigned short *set_translate(int m, struct vc_data *vc)
  *    was active.
  * Still, it is now possible to a certain extent to cut and paste non-ASCII.
  */
-u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode)
+u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode)
 {
 	struct uni_pagedict *p;
 	int m;
 
-	if (glyph < 0 || glyph >= MAX_GLYPH)
+	if (glyph >= MAX_GLYPH)
 		return 0;
 
 	p = *conp->vc_uni_pagedir_loc;
diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
index f7755e73696e..6ef22f01cc51 100644
--- a/drivers/tty/vt/selection.c
+++ b/drivers/tty/vt/selection.c
@@ -68,7 +68,8 @@ sel_pos(int n, bool unicode)
 {
 	if (unicode)
 		return screen_glyph_unicode(vc_sel.cons, n / 2);
-	return inverse_translate(vc_sel.cons, screen_glyph(vc_sel.cons, n), 0);
+	return inverse_translate(vc_sel.cons, screen_glyph(vc_sel.cons, n),
+			false);
 }
 
 /**
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index f8c87c4d7399..1ea1c11c42fd 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -4741,7 +4741,7 @@ u32 screen_glyph_unicode(const struct vc_data *vc, int n)
 
 	if (uniscr)
 		return uniscr->lines[n / vc->vc_cols][n % vc->vc_cols];
-	return inverse_translate(vc, screen_glyph(vc, n * 2), 1);
+	return inverse_translate(vc, screen_glyph(vc, n * 2), true);
 }
 EXPORT_SYMBOL_GPL(screen_glyph_unicode);
 
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index 98171dbed51f..1ff2bf55eb85 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -17,15 +17,15 @@
 struct vc_data;
 
 #ifdef CONFIG_CONSOLE_TRANSLATIONS
-u16 inverse_translate(const struct vc_data *conp, int glyph, int use_unicode);
+u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode);
 unsigned short *set_translate(int m, struct vc_data *vc);
 int conv_uni_to_pc(struct vc_data *conp, long ucs);
 u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 #else
-static inline u16 inverse_translate(const struct vc_data *conp, int glyph,
-		int use_unicode)
+static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
+		bool use_unicode)
 {
 	return glyph;
 }
-- 
cgit v1.2.3


From 5a904a936b407624cd1ff5ee3f1675ca3d2366a5 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 7 Jun 2022 12:49:27 +0200
Subject: tty/vt: consolemap: introduce enum translation_map and use it

Again, instead of magic constants in the code, declare an enum and be a
little bit more explicit. Both in the translations definition and in the
loops etc.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220607104946.18710-17-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/consolemap.c | 39 ++++++++++++++++++++-------------------
 include/linux/consolemap.h  | 18 ++++++++++++------
 2 files changed, 32 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 92b5dddb00d9..80536687acef 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -38,7 +38,7 @@
 
 static unsigned short translations[][256] = {
   /* 8-bit Latin-1 mapped to Unicode -- trivial mapping */
-  {
+  [LAT1_MAP] = {
     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
@@ -71,9 +71,9 @@ static unsigned short translations[][256] = {
     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
     0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
-  }, 
+  },
   /* VT100 graphics mapped to Unicode */
-  {
+  [GRAF_MAP] = {
     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
@@ -108,8 +108,8 @@ static unsigned short translations[][256] = {
     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
   },
   /* IBM Codepage 437 mapped to Unicode */
-  {
-    0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 
+  [IBMPC_MAP] = {
+    0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
     0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c,
     0x25b6, 0x25c0, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8,
     0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc,
@@ -141,9 +141,9 @@ static unsigned short translations[][256] = {
     0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
     0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
     0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
-  }, 
+  },
   /* User mapping -- default to codes for direct font mapping */
-  {
+  [USER_MAP] = {
     0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007,
     0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f,
     0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017,
@@ -184,7 +184,7 @@ static unsigned short translations[][256] = {
 
 #define MAX_GLYPH 512		/* Max possible glyph value */
 
-static int inv_translate[MAX_NR_CONSOLES];
+static enum translation_map inv_translate[MAX_NR_CONSOLES];
 
 #define UNI_DIRS	32U
 #define UNI_DIR_ROWS	32U
@@ -208,24 +208,25 @@ struct uni_pagedict {
 	u16		**uni_pgdir[UNI_DIRS];
 	unsigned long	refcount;
 	unsigned long	sum;
-	unsigned char	*inverse_translations[4];
+	unsigned char	*inverse_translations[LAST_MAP + 1];
 	u16		*inverse_trans_unicode;
 };
 
 static struct uni_pagedict *dflt;
 
-static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *p, int i)
+static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *p,
+	       enum translation_map m)
 {
 	int j, glyph;
-	unsigned short *t = translations[i];
+	unsigned short *t = translations[m];
 	unsigned char *q;
 	
 	if (!p)
 		return;
-	q = p->inverse_translations[i];
+	q = p->inverse_translations[m];
 
 	if (!q) {
-		q = p->inverse_translations[i] = kmalloc(MAX_GLYPH, GFP_KERNEL);
+		q = p->inverse_translations[m] = kmalloc(MAX_GLYPH, GFP_KERNEL);
 		if (!q)
 			return;
 	}
@@ -276,7 +277,7 @@ static void set_inverse_trans_unicode(struct vc_data *conp,
 	}
 }
 
-unsigned short *set_translate(int m, struct vc_data *vc)
+unsigned short *set_translate(enum translation_map m, struct vc_data *vc)
 {
 	inv_translate[vc->vc_num] = m;
 	return translations[m];
@@ -292,7 +293,7 @@ unsigned short *set_translate(int m, struct vc_data *vc)
 u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode)
 {
 	struct uni_pagedict *p;
-	int m;
+	enum translation_map m;
 
 	if (glyph >= MAX_GLYPH)
 		return 0;
@@ -669,8 +670,8 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 	if (con_unify_unimap(vc, p))
 		goto out_unlock;
 
-	for (i = 0; i <= 3; i++)
-		set_inverse_transl(vc, p, i); /* Update inverse translations */
+	for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++)
+		set_inverse_transl(vc, p, m); /* Update inverse translations */
 	set_inverse_trans_unicode(vc, p);
 
 out_unlock:
@@ -731,8 +732,8 @@ int con_set_default_unimap(struct vc_data *vc)
 		return err;
 	}
 
-	for (i = 0; i <= 3; i++)
-		set_inverse_transl(vc, p, i);	/* Update all inverse translations */
+	for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++)
+		set_inverse_transl(vc, p, m);	/* Update all inverse translations */
 	set_inverse_trans_unicode(vc, p);
 	dflt = p;
 	return err;
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index 1ff2bf55eb85..c35db4896c37 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -7,10 +7,15 @@
 #ifndef __LINUX_CONSOLEMAP_H__
 #define __LINUX_CONSOLEMAP_H__
 
-#define LAT1_MAP 0
-#define GRAF_MAP 1
-#define IBMPC_MAP 2
-#define USER_MAP 3
+enum translation_map {
+	LAT1_MAP,
+	GRAF_MAP,
+	IBMPC_MAP,
+	USER_MAP,
+
+	FIRST_MAP = LAT1_MAP,
+	LAST_MAP = USER_MAP,
+};
 
 #include <linux/types.h>
 
@@ -18,7 +23,7 @@ struct vc_data;
 
 #ifdef CONFIG_CONSOLE_TRANSLATIONS
 u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode);
-unsigned short *set_translate(int m, struct vc_data *vc);
+unsigned short *set_translate(enum translation_map m, struct vc_data *vc);
 int conv_uni_to_pc(struct vc_data *conp, long ucs);
 u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
@@ -30,7 +35,8 @@ static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
 	return glyph;
 }
 
-static inline unsigned short *set_translate(int m, struct vc_data *vc)
+static inline unsigned short *set_translate(enum translation_map m,
+		struct vc_data *vc)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 8322b1f527159de578aab277629296575a11eb3c Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 6 Jun 2022 13:03:58 +0300
Subject: serial: Add uart_rs485_config()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few serial drivers make a call to rs485_config() themselves (all
these seem to relate to init). Convert them all to use a common helper
which makes it easy to make adjustments on tasks related to it as
serial_rs485 struct sanitization is going to be added.

In pci_fintek_setup() (in 8250_pci.c), the rs485_config() call was made
with NULL, however, it can be changed to pass uart_port's rs485 struct.
No other callers should pass NULL into rs485_config() so the NULL check
can now be eliminated.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220606100433.13793-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_pci.c  | 6 ++----
 drivers/tty/serial/8250/8250_port.c | 2 +-
 drivers/tty/serial/fsl_lpuart.c     | 2 +-
 drivers/tty/serial/imx.c            | 2 +-
 drivers/tty/serial/serial_core.c    | 6 ++++++
 include/linux/serial_core.h         | 1 +
 6 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index a17619db7939..fb0a49e39072 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1562,9 +1562,7 @@ static int pci_fintek_rs485_config(struct uart_port *port,
 
 	pci_read_config_byte(pci_dev, 0x40 + 8 * *index + 7, &setting);
 
-	if (!rs485)
-		rs485 = &port->rs485;
-	else if (rs485->flags & SER_RS485_ENABLED)
+	if (rs485->flags & SER_RS485_ENABLED)
 		memset(rs485->padding, 0, sizeof(rs485->padding));
 	else
 		memset(rs485, 0, sizeof(*rs485));
@@ -1689,7 +1687,7 @@ static int pci_fintek_init(struct pci_dev *dev)
 			 * pciserial_resume_ports()
 			 */
 			port = serial8250_get_port(priv->line[i]);
-			pci_fintek_rs485_config(&port->port, NULL);
+			uart_rs485_config(&port->port);
 		} else {
 			/* First init without port data
 			 * force init to RS232 Mode
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 30e0aaf52adc..af550a4a27f8 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -3191,7 +3191,7 @@ static void serial8250_config_port(struct uart_port *port, int flags)
 		autoconfig(up);
 
 	if (port->rs485.flags & SER_RS485_ENABLED)
-		port->rs485_config(port, &port->rs485);
+		uart_rs485_config(port);
 
 	/* if access method is AU, it is a 16550 with a quirk */
 	if (port->type == PORT_16550A && port->iotype == UPIO_AU)
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index 0d6e62f6bb07..509a7912fa9d 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2724,7 +2724,7 @@ static int lpuart_probe(struct platform_device *pdev)
 	    sport->port.rs485.delay_rts_after_send)
 		dev_err(&pdev->dev, "driver doesn't support RTS delays\n");
 
-	sport->port.rs485_config(&sport->port, &sport->port.rs485);
+	uart_rs485_config(&sport->port);
 
 	ret = devm_request_irq(&pdev->dev, sport->port.irq, handler, 0,
 				DRIVER_NAME, sport);
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 30edb35a6a15..17fb9a57078b 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -2338,7 +2338,7 @@ static int imx_uart_probe(struct platform_device *pdev)
 		dev_err(&pdev->dev,
 			"low-active RTS not possible when receiver is off, enabling receiver\n");
 
-	imx_uart_rs485_config(&sport->port, &sport->port.rs485);
+	uart_rs485_config(&sport->port);
 
 	/* Disable interrupts before requesting them */
 	ucr1 = imx_uart_readl(sport, UCR1);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 9a85b41caa0a..8466181db4e9 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1276,6 +1276,12 @@ static int uart_get_icount(struct tty_struct *tty,
 	return 0;
 }
 
+int uart_rs485_config(struct uart_port *port)
+{
+	return port->rs485_config(port, &port->rs485);
+}
+EXPORT_SYMBOL_GPL(uart_rs485_config);
+
 static int uart_get_rs485_config(struct uart_port *port,
 			 struct serial_rs485 __user *rs485)
 {
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index cbd5070bc87f..d3ebb4db2d80 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -592,4 +592,5 @@ static inline int uart_handle_break(struct uart_port *port)
 					 !((cflag) & CLOCAL))
 
 int uart_get_rs485_mode(struct uart_port *port);
+int uart_rs485_config(struct uart_port *port);
 #endif /* LINUX_SERIAL_CORE_H */
-- 
cgit v1.2.3


From 8925c31c1ac2f1e05da988581f2a70a2a8c4d638 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 6 Jun 2022 13:04:00 +0300
Subject: serial: Add rs485_supported to uart_port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Preparing to move serial_rs485 struct sanitization into serial core,
each driver has to provide what fields/flags it supports. This
information is pointed into by rs485_supported.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220606100433.13793-4-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_core.c | 1 +
 include/linux/serial_core.h         | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index b0320de3379c..90ddc8924811 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1003,6 +1003,7 @@ int serial8250_register_8250_port(const struct uart_8250_port *up)
 		uart->port.throttle	= up->port.throttle;
 		uart->port.unthrottle	= up->port.unthrottle;
 		uart->port.rs485_config	= up->port.rs485_config;
+		uart->port.rs485_supported = up->port.rs485_supported;
 		uart->port.rs485	= up->port.rs485;
 		uart->rs485_start_tx	= up->rs485_start_tx;
 		uart->rs485_stop_tx	= up->rs485_stop_tx;
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d3ebb4db2d80..5518b70177b3 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -254,6 +254,7 @@ struct uart_port {
 	struct attribute_group	*attr_group;		/* port specific attributes */
 	const struct attribute_group **tty_groups;	/* all attributes (serial core use only) */
 	struct serial_rs485     rs485;
+	const struct serial_rs485	*rs485_supported;	/* Supported mask for serial_rs485 */
 	struct gpio_desc	*rs485_term_gpio;	/* enable RS485 bus termination */
 	struct serial_iso7816   iso7816;
 	void			*private_data;		/* generic platform data pointer */
-- 
cgit v1.2.3


From 6bb6fa6908ebd3cb4e14cd4f0ce272ec885d2eb0 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 6 Jun 2022 18:36:51 +0300
Subject: tty: Implement lookahead to process XON/XOFF timely
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When tty is not read from, XON/XOFF may get stuck into an
intermediate buffer. As those characters are there to do software
flow-control, it is not very useful. In the case where neither end
reads from ttys, the receiving ends might not be able receive the
XOFF characters and just keep sending more data to the opposite
direction. This problem is almost guaranteed to occur with DMA
which sends data in large chunks.

If TTY is slow to process characters, that is, eats less than given
amount in receive_buf, invoke lookahead for the rest of the chars
to process potential XON/XOFF characters.

We need to keep track of how many characters have been processed by the
lookahead to avoid processing the flow control char again on the normal
path. Bookkeeping occurs parallel on two layers (tty_buffer and n_tty)
to avoid passing the lookahead_count through the whole call chain.

When a flow-control char is processed, two things must occur:
  a) it must not be treated as normal char
  b) if not yet processed, flow-control actions need to be taken
The return value of n_tty_receive_char_flow_ctrl() tells caller a), and
b) is kept internal to n_tty_receive_char_flow_ctrl().

If characters were previous looked ahead, __receive_buf() makes two
calls to the appropriate n_tty_receive_buf_* function. First call is
made with lookahead_done=true for the characters that were subject to
lookahead earlier and then with lookahead=false for the new characters.
Either of the calls might be skipped when it has no characters to
handle.

Reported-by: Gilles Buloz <gilles.buloz@kontron.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220606153652.63554-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c        | 91 ++++++++++++++++++++++++++++++++++++++--------
 drivers/tty/tty_buffer.c   | 59 +++++++++++++++++++++++++-----
 drivers/tty/tty_port.c     | 21 +++++++++++
 include/linux/tty_buffer.h |  1 +
 include/linux/tty_ldisc.h  | 14 +++++++
 include/linux/tty_port.h   |  2 +
 6 files changed, 163 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 640c9e871044..8d80384df874 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -118,6 +118,9 @@ struct n_tty_data {
 	size_t read_tail;
 	size_t line_start;
 
+	/* # of chars looked ahead (to find software flow control chars) */
+	size_t lookahead_count;
+
 	/* protected by output lock */
 	unsigned int column;
 	unsigned int canon_column;
@@ -333,6 +336,8 @@ static void reset_buffer_flags(struct n_tty_data *ldata)
 	ldata->erasing = 0;
 	bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
 	ldata->push = 0;
+
+	ldata->lookahead_count = 0;
 }
 
 static void n_tty_packet_mode_flush(struct tty_struct *tty)
@@ -1225,12 +1230,30 @@ static bool n_tty_is_char_flow_ctrl(struct tty_struct *tty, unsigned char c)
 	return c == START_CHAR(tty) || c == STOP_CHAR(tty);
 }
 
-/* Returns true if c is consumed as flow-control character */
-static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, unsigned char c)
+/**
+ * n_tty_receive_char_flow_ctrl - receive flow control chars
+ * @tty: terminal device
+ * @c: character
+ * @lookahead_done: lookahead has processed this character already
+ *
+ * Receive and process flow control character actions.
+ *
+ * In case lookahead for flow control chars already handled the character in
+ * advance to the normal receive, the actions are skipped during normal
+ * receive.
+ *
+ * Returns true if @c is consumed as flow-control character, the character
+ * must not be treated as normal character.
+ */
+static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, unsigned char c,
+					 bool lookahead_done)
 {
 	if (!n_tty_is_char_flow_ctrl(tty, c))
 		return false;
 
+	if (lookahead_done)
+		return true;
+
 	if (c == START_CHAR(tty)) {
 		start_tty(tty);
 		process_echoes(tty);
@@ -1242,11 +1265,12 @@ static bool n_tty_receive_char_flow_ctrl(struct tty_struct *tty, unsigned char c
 	return true;
 }
 
-static void n_tty_receive_char_special(struct tty_struct *tty, unsigned char c)
+static void n_tty_receive_char_special(struct tty_struct *tty, unsigned char c,
+				       bool lookahead_done)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 
-	if (I_IXON(tty) && n_tty_receive_char_flow_ctrl(tty, c))
+	if (I_IXON(tty) && n_tty_receive_char_flow_ctrl(tty, c, lookahead_done))
 		return;
 
 	if (L_ISIG(tty)) {
@@ -1401,7 +1425,8 @@ static void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 	put_tty_queue(c, ldata);
 }
 
-static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c)
+static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c,
+				       bool lookahead_done)
 {
 	if (I_ISTRIP(tty))
 		c &= 0x7f;
@@ -1409,9 +1434,12 @@ static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c)
 		c = tolower(c);
 
 	if (I_IXON(tty)) {
-		if (c == STOP_CHAR(tty))
-			stop_tty(tty);
-		else if (c == START_CHAR(tty) ||
+		if (c == STOP_CHAR(tty)) {
+			if (!lookahead_done)
+				stop_tty(tty);
+		} else if (c == START_CHAR(tty) && lookahead_done) {
+			return;
+		} else if (c == START_CHAR(tty) ||
 			 (tty->flow.stopped && !tty->flow.tco_stopped && I_IXANY(tty) &&
 			  c != INTR_CHAR(tty) && c != QUIT_CHAR(tty) &&
 			  c != SUSP_CHAR(tty))) {
@@ -1457,6 +1485,27 @@ n_tty_receive_char_lnext(struct tty_struct *tty, unsigned char c, char flag)
 		n_tty_receive_char_flagged(tty, c, flag);
 }
 
+/* Caller must ensure count > 0 */
+static void n_tty_lookahead_flow_ctrl(struct tty_struct *tty, const unsigned char *cp,
+				      const unsigned char *fp, unsigned int count)
+{
+	struct n_tty_data *ldata = tty->disc_data;
+	unsigned char flag = TTY_NORMAL;
+
+	ldata->lookahead_count += count;
+
+	if (!I_IXON(tty))
+		return;
+
+	while (count--) {
+		if (fp)
+			flag = *fp++;
+		if (likely(flag == TTY_NORMAL))
+			n_tty_receive_char_flow_ctrl(tty, *cp, false);
+		cp++;
+	}
+}
+
 static void
 n_tty_receive_buf_real_raw(struct tty_struct *tty, const unsigned char *cp,
 			   const char *fp, int count)
@@ -1496,7 +1545,7 @@ n_tty_receive_buf_raw(struct tty_struct *tty, const unsigned char *cp,
 
 static void
 n_tty_receive_buf_closing(struct tty_struct *tty, const unsigned char *cp,
-			  const char *fp, int count)
+			  const char *fp, int count, bool lookahead_done)
 {
 	char flag = TTY_NORMAL;
 
@@ -1504,12 +1553,12 @@ n_tty_receive_buf_closing(struct tty_struct *tty, const unsigned char *cp,
 		if (fp)
 			flag = *fp++;
 		if (likely(flag == TTY_NORMAL))
-			n_tty_receive_char_closing(tty, *cp++);
+			n_tty_receive_char_closing(tty, *cp++, lookahead_done);
 	}
 }
 
 static void n_tty_receive_buf_standard(struct tty_struct *tty,
-		const unsigned char *cp, const char *fp, int count)
+		const unsigned char *cp, const char *fp, int count, bool lookahead_done)
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	char flag = TTY_NORMAL;
@@ -1540,7 +1589,7 @@ static void n_tty_receive_buf_standard(struct tty_struct *tty,
 		}
 
 		if (test_bit(c, ldata->char_map))
-			n_tty_receive_char_special(tty, c);
+			n_tty_receive_char_special(tty, c, lookahead_done);
 		else
 			n_tty_receive_char(tty, c);
 	}
@@ -1551,21 +1600,30 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp,
 {
 	struct n_tty_data *ldata = tty->disc_data;
 	bool preops = I_ISTRIP(tty) || (I_IUCLC(tty) && L_IEXTEN(tty));
+	size_t la_count = min_t(size_t, ldata->lookahead_count, count);
 
 	if (ldata->real_raw)
 		n_tty_receive_buf_real_raw(tty, cp, fp, count);
 	else if (ldata->raw || (L_EXTPROC(tty) && !preops))
 		n_tty_receive_buf_raw(tty, cp, fp, count);
-	else if (tty->closing && !L_EXTPROC(tty))
-		n_tty_receive_buf_closing(tty, cp, fp, count);
-	else {
-		n_tty_receive_buf_standard(tty, cp, fp, count);
+	else if (tty->closing && !L_EXTPROC(tty)) {
+		if (la_count > 0)
+			n_tty_receive_buf_closing(tty, cp, fp, la_count, true);
+		if (count > la_count)
+			n_tty_receive_buf_closing(tty, cp, fp, count - la_count, false);
+	} else {
+		if (la_count > 0)
+			n_tty_receive_buf_standard(tty, cp, fp, la_count, true);
+		if (count > la_count)
+			n_tty_receive_buf_standard(tty, cp, fp, count - la_count, false);
 
 		flush_echoes(tty);
 		if (tty->ops->flush_chars)
 			tty->ops->flush_chars(tty);
 	}
 
+	ldata->lookahead_count -= la_count;
+
 	if (ldata->icanon && !L_EXTPROC(tty))
 		return;
 
@@ -2446,6 +2504,7 @@ static struct tty_ldisc_ops n_tty_ops = {
 	.receive_buf     = n_tty_receive_buf,
 	.write_wakeup    = n_tty_write_wakeup,
 	.receive_buf2	 = n_tty_receive_buf2,
+	.lookahead_buf	 = n_tty_lookahead_flow_ctrl,
 };
 
 /**
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index bfa431a8e690..754fa43670cc 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -5,6 +5,7 @@
 
 #include <linux/types.h>
 #include <linux/errno.h>
+#include <linux/minmax.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
 #include <linux/tty_flip.h>
@@ -104,6 +105,7 @@ static void tty_buffer_reset(struct tty_buffer *p, size_t size)
 	p->size = size;
 	p->next = NULL;
 	p->commit = 0;
+	p->lookahead = 0;
 	p->read = 0;
 	p->flags = 0;
 }
@@ -234,6 +236,7 @@ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld)
 		buf->head = next;
 	}
 	buf->head->read = buf->head->commit;
+	buf->head->lookahead = buf->head->read;
 
 	if (ld && ld->ops->flush_buffer)
 		ld->ops->flush_buffer(tty);
@@ -276,13 +279,15 @@ static int __tty_buffer_request_room(struct tty_port *port, size_t size,
 		if (n != NULL) {
 			n->flags = flags;
 			buf->tail = n;
-			/* paired w/ acquire in flush_to_ldisc(); ensures
-			 * flush_to_ldisc() sees buffer data.
+			/*
+			 * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs()
+			 * ensures they see all buffer data.
 			 */
 			smp_store_release(&b->commit, b->used);
-			/* paired w/ acquire in flush_to_ldisc(); ensures the
-			 * latest commit value can be read before the head is
-			 * advanced to the next buffer
+			/*
+			 * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs()
+			 * ensures the latest commit value can be read before the head
+			 * is advanced to the next buffer.
 			 */
 			smp_store_release(&b->next, n);
 		} else if (change)
@@ -459,6 +464,40 @@ int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 }
 EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf);
 
+static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head)
+{
+	head->lookahead = max(head->lookahead, head->read);
+
+	while (head) {
+		struct tty_buffer *next;
+		unsigned char *p, *f = NULL;
+		unsigned int count;
+
+		/*
+		 * Paired w/ release in __tty_buffer_request_room();
+		 * ensures commit value read is not stale if the head
+		 * is advancing to the next buffer.
+		 */
+		next = smp_load_acquire(&head->next);
+		/*
+		 * Paired w/ release in __tty_buffer_request_room() or in
+		 * tty_buffer_flush(); ensures we see the committed buffer data.
+		 */
+		count = smp_load_acquire(&head->commit) - head->lookahead;
+		if (!count) {
+			head = next;
+			continue;
+		}
+
+		p = char_buf_ptr(head, head->lookahead);
+		if (~head->flags & TTYB_NORMAL)
+			f = flag_buf_ptr(head, head->lookahead);
+
+		port->client_ops->lookahead_buf(port, p, f, count);
+		head->lookahead += count;
+	}
+}
+
 static int
 receive_buf(struct tty_port *port, struct tty_buffer *head, int count)
 {
@@ -496,7 +535,7 @@ static void flush_to_ldisc(struct work_struct *work)
 	while (1) {
 		struct tty_buffer *head = buf->head;
 		struct tty_buffer *next;
-		int count;
+		int count, rcvd;
 
 		/* Ldisc or user is trying to gain exclusive access */
 		if (atomic_read(&buf->priority))
@@ -519,10 +558,12 @@ static void flush_to_ldisc(struct work_struct *work)
 			continue;
 		}
 
-		count = receive_buf(port, head, count);
-		if (!count)
+		rcvd = receive_buf(port, head, count);
+		head->read += rcvd;
+		if (rcvd < count)
+			lookahead_bufs(port, head);
+		if (!rcvd)
 			break;
-		head->read += count;
 
 		if (need_resched())
 			cond_resched();
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 880608a65773..dce08a6d7b5e 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -43,6 +43,26 @@ static int tty_port_default_receive_buf(struct tty_port *port,
 	return ret;
 }
 
+static void tty_port_default_lookahead_buf(struct tty_port *port, const unsigned char *p,
+					   const unsigned char *f, unsigned int count)
+{
+	struct tty_struct *tty;
+	struct tty_ldisc *disc;
+
+	tty = READ_ONCE(port->itty);
+	if (!tty)
+		return;
+
+	disc = tty_ldisc_ref(tty);
+	if (!disc)
+		return;
+
+	if (disc->ops->lookahead_buf)
+		disc->ops->lookahead_buf(disc->tty, p, f, count);
+
+	tty_ldisc_deref(disc);
+}
+
 static void tty_port_default_wakeup(struct tty_port *port)
 {
 	struct tty_struct *tty = tty_port_tty_get(port);
@@ -55,6 +75,7 @@ static void tty_port_default_wakeup(struct tty_port *port)
 
 const struct tty_port_client_operations tty_port_default_client_ops = {
 	.receive_buf = tty_port_default_receive_buf,
+	.lookahead_buf = tty_port_default_lookahead_buf,
 	.write_wakeup = tty_port_default_wakeup,
 };
 EXPORT_SYMBOL_GPL(tty_port_default_client_ops);
diff --git a/include/linux/tty_buffer.h b/include/linux/tty_buffer.h
index 3b9d77604291..1796648c2907 100644
--- a/include/linux/tty_buffer.h
+++ b/include/linux/tty_buffer.h
@@ -15,6 +15,7 @@ struct tty_buffer {
 	int used;
 	int size;
 	int commit;
+	int lookahead;		/* Lazy update on recv, can become less than "read" */
 	int read;
 	int flags;
 	/* Data points here */
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index e85002b56752..33678e1936f6 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -186,6 +186,18 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	indicate all data received is %TTY_NORMAL. If assigned, prefer this
  *	function for automatic flow control.
  *
+ * @lookahead_buf: [DRV] ``void ()(struct tty_struct *tty,
+ *			const unsigned char *cp, const char *fp, int count)
+ *
+ *	This function is called by the low-level tty driver for characters
+ *	not eaten by ->receive_buf() or ->receive_buf2(). It is useful for
+ *	processing high-priority characters such as software flow-control
+ *	characters that could otherwise get stuck into the intermediate
+ *	buffer until tty has room to receive them. Ldisc must be able to
+ *	handle later a ->receive_buf() or ->receive_buf2() call for the
+ *	same characters (e.g. by skipping the actions for high-priority
+ *	characters already handled by ->lookahead_buf()).
+ *
  * @owner: module containting this ldisc (for reference counting)
  *
  * This structure defines the interface between the tty line discipline
@@ -229,6 +241,8 @@ struct tty_ldisc_ops {
 	void	(*dcd_change)(struct tty_struct *tty, unsigned int status);
 	int	(*receive_buf2)(struct tty_struct *tty, const unsigned char *cp,
 				const char *fp, int count);
+	void	(*lookahead_buf)(struct tty_struct *tty, const unsigned char *cp,
+				 const unsigned char *fp, unsigned int count);
 
 	struct  module *owner;
 };
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 58e9619116b7..fa3c3bdaa234 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -40,6 +40,8 @@ struct tty_port_operations {
 
 struct tty_port_client_operations {
 	int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t);
+	void (*lookahead_buf)(struct tty_port *port, const unsigned char *cp,
+			      const unsigned char *fp, unsigned int count);
 	void (*write_wakeup)(struct tty_port *port);
 };
 
-- 
cgit v1.2.3


From 0eb6584068642767baab17c2e4385a6b9c029caa Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:36 +0200
Subject: platform/surface: aggregator: Allow is_ssam_device() to be used when
 CONFIG_SURFACE_AGGREGATOR_BUS is disabled

In SSAM subsystem drivers that handle both ACPI and SSAM-native client
devices, we may want to check whether we have a SSAM (native) client
device. Further, we may want to do this even when instantiation thereof
cannot happen due to CONFIG_SURFACE_AGGREGATOR_BUS=n. Currently, doing
so causes an error due to an undefined reference error due to
ssam_device_type being placed in the bus source unit.

Therefore, if CONFIG_SURFACE_AGGREGATOR_BUS is not defined, simply let
is_ssam_device() return false to prevent this error.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/device.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index cc257097eb05..62b38b4487eb 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -177,6 +177,8 @@ struct ssam_device_driver {
 	void (*remove)(struct ssam_device *sdev);
 };
 
+#ifdef CONFIG_SURFACE_AGGREGATOR_BUS
+
 extern struct bus_type ssam_bus_type;
 extern const struct device_type ssam_device_type;
 
@@ -193,6 +195,15 @@ static inline bool is_ssam_device(struct device *d)
 	return d->type == &ssam_device_type;
 }
 
+#else /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
+static inline bool is_ssam_device(struct device *d)
+{
+	return false;
+}
+
+#endif /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
 /**
  * to_ssam_device() - Casts the given device to a SSAM client device.
  * @d: The device to cast.
-- 
cgit v1.2.3


From dc0393c76f378f68961587fd4f32de29fb8f0c79 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:37 +0200
Subject: platform/surface: aggregator: Allow devices to be marked as
 hot-removed

Some SSAM devices, notably the keyboard cover (keyboard and touchpad) on
the Surface Pro 8, can be hot-removed. When this occurs, communication
with the device may fail and time out. This timeout can unnecessarily
block and slow down device removal and even cause issues when the
devices are detached and re-attached quickly. Thus, communication should
generally be avoided once hot-removal is detected.

While we already remove a device as soon as we detect its (hot-)removal,
the corresponding device driver may still attempt to communicate with
the device during teardown. This is especially critical as communication
failure may also extend to disabling of events, which is typically done
at that stage.

Add a flag to allow marking devices as hot-removed. This can then be
used during client driver teardown to check if any communication
attempts should be avoided.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-3-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/device.h | 48 +++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 62b38b4487eb..6df7c8d4e50e 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -148,17 +148,30 @@ struct ssam_device_uid {
 #define SSAM_SDEV(cat, tid, iid, fun) \
 	SSAM_DEVICE(SSAM_DOMAIN_SERIALHUB, SSAM_SSH_TC_##cat, tid, iid, fun)
 
+/*
+ * enum ssam_device_flags - Flags for SSAM client devices.
+ * @SSAM_DEVICE_HOT_REMOVED_BIT:
+ *	The device has been hot-removed. Further communication with it may time
+ *	out and should be avoided.
+ */
+enum ssam_device_flags {
+	SSAM_DEVICE_HOT_REMOVED_BIT = 0,
+};
+
 /**
  * struct ssam_device - SSAM client device.
- * @dev:  Driver model representation of the device.
- * @ctrl: SSAM controller managing this device.
- * @uid:  UID identifying the device.
+ * @dev:   Driver model representation of the device.
+ * @ctrl:  SSAM controller managing this device.
+ * @uid:   UID identifying the device.
+ * @flags: Device state flags, see &enum ssam_device_flags.
  */
 struct ssam_device {
 	struct device dev;
 	struct ssam_controller *ctrl;
 
 	struct ssam_device_uid uid;
+
+	unsigned long flags;
 };
 
 /**
@@ -251,6 +264,35 @@ struct ssam_device *ssam_device_alloc(struct ssam_controller *ctrl,
 int ssam_device_add(struct ssam_device *sdev);
 void ssam_device_remove(struct ssam_device *sdev);
 
+/**
+ * ssam_device_mark_hot_removed() - Mark the given device as hot-removed.
+ * @sdev: The device to mark as hot-removed.
+ *
+ * Mark the device as having been hot-removed. This signals drivers using the
+ * device that communication with the device should be avoided and may lead to
+ * timeouts.
+ */
+static inline void ssam_device_mark_hot_removed(struct ssam_device *sdev)
+{
+	dev_dbg(&sdev->dev, "marking device as hot-removed\n");
+	set_bit(SSAM_DEVICE_HOT_REMOVED_BIT, &sdev->flags);
+}
+
+/**
+ * ssam_device_is_hot_removed() - Check if the given device has been
+ * hot-removed.
+ * @sdev: The device to check.
+ *
+ * Checks if the given device has been marked as hot-removed. See
+ * ssam_device_mark_hot_removed() for more details.
+ *
+ * Return: Returns ``true`` if the device has been marked as hot-removed.
+ */
+static inline bool ssam_device_is_hot_removed(struct ssam_device *sdev)
+{
+	return test_bit(SSAM_DEVICE_HOT_REMOVED_BIT, &sdev->flags);
+}
+
 /**
  * ssam_device_get() - Increment reference count of SSAM client device.
  * @sdev: The device to increment the reference count of.
-- 
cgit v1.2.3


From 5c1e88b98c60e4074796e9a05d3c674479ab1919 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:38 +0200
Subject: platform/surface: aggregator: Allow notifiers to avoid communication
 on unregistering

When SSAM client devices have been (physically) hot-removed,
communication attempts with those devices may fail and time out. This
can even extend to event notifiers, due to which timeouts may occur
during device removal, slowing down that process.

Add a parameter to the notifier unregister function that allows skipping
communication with the EC to prevent this. Furthermore, add wrappers for
registering and unregistering notifiers belonging to SSAM client devices
that automatically check if the device has been marked as hot-removed
and communication should be avoided.

Note that non-SSAM client devices can generally not be hot-removed, so
also add a convenience wrapper for those, defaulting to allow
communication.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-4-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../driver-api/surface_aggregator/client.rst       |  6 +-
 drivers/platform/surface/aggregator/controller.c   | 53 +++++++++++------
 include/linux/surface_aggregator/controller.h      | 24 +++++++-
 include/linux/surface_aggregator/device.h          | 66 ++++++++++++++++++++++
 4 files changed, 128 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/surface_aggregator/client.rst b/Documentation/driver-api/surface_aggregator/client.rst
index e519d374c378..27f95abdbe99 100644
--- a/Documentation/driver-api/surface_aggregator/client.rst
+++ b/Documentation/driver-api/surface_aggregator/client.rst
@@ -17,6 +17,8 @@
 .. |SSAM_DEVICE| replace:: :c:func:`SSAM_DEVICE`
 .. |ssam_notifier_register| replace:: :c:func:`ssam_notifier_register`
 .. |ssam_notifier_unregister| replace:: :c:func:`ssam_notifier_unregister`
+.. |ssam_device_notifier_register| replace:: :c:func:`ssam_device_notifier_register`
+.. |ssam_device_notifier_unregister| replace:: :c:func:`ssam_device_notifier_unregister`
 .. |ssam_request_sync| replace:: :c:func:`ssam_request_sync`
 .. |ssam_event_mask| replace:: :c:type:`enum ssam_event_mask <ssam_event_mask>`
 
@@ -312,7 +314,9 @@ Handling Events
 To receive events from the SAM EC, an event notifier must be registered for
 the desired event via |ssam_notifier_register|. The notifier must be
 unregistered via |ssam_notifier_unregister| once it is not required any
-more.
+more. For |ssam_device| type clients, the |ssam_device_notifier_register| and
+|ssam_device_notifier_unregister| wrappers should be preferred as they properly
+handle hot-removal of client devices.
 
 Event notifiers are registered by providing (at minimum) a callback to call
 in case an event has been received, the registry specifying how the event
diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
index b8c377b3f932..6de834b52b63 100644
--- a/drivers/platform/surface/aggregator/controller.c
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -2199,16 +2199,26 @@ static int ssam_nf_refcount_enable(struct ssam_controller *ctrl,
 }
 
 /**
- * ssam_nf_refcount_disable_free() - Disable event for reference count entry if it is
- * no longer in use and free the corresponding entry.
+ * ssam_nf_refcount_disable_free() - Disable event for reference count entry if
+ * it is no longer in use and free the corresponding entry.
  * @ctrl:  The controller to disable the event on.
  * @entry: The reference count entry for the event to be disabled.
  * @flags: The flags used for enabling the event on the EC.
+ * @ec:    Flag specifying if the event should actually be disabled on the EC.
  *
- * If the reference count equals zero, i.e. the event is no longer requested by
- * any client, the event will be disabled and the corresponding reference count
- * entry freed. The reference count entry must not be used any more after a
- * call to this function.
+ * If ``ec`` equals ``true`` and the reference count equals zero (i.e. the
+ * event is no longer requested by any client), the specified event will be
+ * disabled on the EC via the corresponding request.
+ *
+ * If ``ec`` equals ``false``, no request will be sent to the EC and the event
+ * can be considered in a detached state (i.e. no longer used but still
+ * enabled). Disabling an event via this method may be required for
+ * hot-removable devices, where event disable requests may time out after the
+ * device has been physically removed.
+ *
+ * In both cases, if the reference count equals zero, the corresponding
+ * reference count entry will be freed. The reference count entry must not be
+ * used any more after a call to this function.
  *
  * Also checks if the flags used for disabling the event match the flags used
  * for enabling the event and warns if they do not (regardless of reference
@@ -2223,7 +2233,7 @@ static int ssam_nf_refcount_enable(struct ssam_controller *ctrl,
  * returns the status of the event-enable EC command.
  */
 static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
-					 struct ssam_nf_refcount_entry *entry, u8 flags)
+					 struct ssam_nf_refcount_entry *entry, u8 flags, bool ec)
 {
 	const struct ssam_event_registry reg = entry->key.reg;
 	const struct ssam_event_id id = entry->key.id;
@@ -2232,8 +2242,9 @@ static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
 
 	lockdep_assert_held(&nf->lock);
 
-	ssam_dbg(ctrl, "disabling event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
-		 reg.target_category, id.target_category, id.instance, entry->refcount);
+	ssam_dbg(ctrl, "%s event (reg: %#04x, tc: %#04x, iid: %#04x, rc: %d)\n",
+		 ec ? "disabling" : "detaching", reg.target_category, id.target_category,
+		 id.instance, entry->refcount);
 
 	if (entry->flags != flags) {
 		ssam_warn(ctrl,
@@ -2242,7 +2253,7 @@ static int ssam_nf_refcount_disable_free(struct ssam_controller *ctrl,
 			  id.instance);
 	}
 
-	if (entry->refcount == 0) {
+	if (ec && entry->refcount == 0) {
 		status = ssam_ssh_event_disable(ctrl, reg, id, flags);
 		kfree(entry);
 	}
@@ -2322,20 +2333,26 @@ int ssam_notifier_register(struct ssam_controller *ctrl, struct ssam_event_notif
 EXPORT_SYMBOL_GPL(ssam_notifier_register);
 
 /**
- * ssam_notifier_unregister() - Unregister an event notifier.
- * @ctrl: The controller the notifier has been registered on.
- * @n:    The event notifier to unregister.
+ * __ssam_notifier_unregister() - Unregister an event notifier.
+ * @ctrl:    The controller the notifier has been registered on.
+ * @n:       The event notifier to unregister.
+ * @disable: Whether to disable the corresponding event on the EC.
  *
  * Unregister an event notifier. Decrement the usage counter of the associated
  * SAM event if the notifier is not marked as an observer. If the usage counter
- * reaches zero, the event will be disabled.
+ * reaches zero and ``disable`` equals ``true``, the event will be disabled.
+ *
+ * Useful for hot-removable devices, where communication may fail once the
+ * device has been physically removed. In that case, specifying ``disable`` as
+ * ``false`` avoids communication with the EC.
  *
  * Return: Returns zero on success, %-ENOENT if the given notifier block has
  * not been registered on the controller. If the given notifier block was the
  * last one associated with its specific event, returns the status of the
  * event-disable EC-command.
  */
-int ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_notifier *n)
+int __ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_notifier *n,
+			       bool disable)
 {
 	u16 rqid = ssh_tc_to_rqid(n->event.id.target_category);
 	struct ssam_nf_refcount_entry *entry;
@@ -2373,7 +2390,7 @@ int ssam_notifier_unregister(struct ssam_controller *ctrl, struct ssam_event_not
 			goto remove;
 		}
 
-		status = ssam_nf_refcount_disable_free(ctrl, entry, n->event.flags);
+		status = ssam_nf_refcount_disable_free(ctrl, entry, n->event.flags, disable);
 	}
 
 remove:
@@ -2383,7 +2400,7 @@ remove:
 
 	return status;
 }
-EXPORT_SYMBOL_GPL(ssam_notifier_unregister);
+EXPORT_SYMBOL_GPL(__ssam_notifier_unregister);
 
 /**
  * ssam_controller_event_enable() - Enable the specified event.
@@ -2477,7 +2494,7 @@ int ssam_controller_event_disable(struct ssam_controller *ctrl,
 		return -ENOENT;
 	}
 
-	status = ssam_nf_refcount_disable_free(ctrl, entry, flags);
+	status = ssam_nf_refcount_disable_free(ctrl, entry, flags, true);
 
 	mutex_unlock(&nf->lock);
 	return status;
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 74bfdffaf7b0..50a2b4926c06 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -835,8 +835,28 @@ struct ssam_event_notifier {
 int ssam_notifier_register(struct ssam_controller *ctrl,
 			   struct ssam_event_notifier *n);
 
-int ssam_notifier_unregister(struct ssam_controller *ctrl,
-			     struct ssam_event_notifier *n);
+int __ssam_notifier_unregister(struct ssam_controller *ctrl,
+			       struct ssam_event_notifier *n, bool disable);
+
+/**
+ * ssam_notifier_unregister() - Unregister an event notifier.
+ * @ctrl:    The controller the notifier has been registered on.
+ * @n:       The event notifier to unregister.
+ *
+ * Unregister an event notifier. Decrement the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the usage counter
+ * reaches zero, the event will be disabled.
+ *
+ * Return: Returns zero on success, %-ENOENT if the given notifier block has
+ * not been registered on the controller. If the given notifier block was the
+ * last one associated with its specific event, returns the status of the
+ * event-disable EC-command.
+ */
+static inline int ssam_notifier_unregister(struct ssam_controller *ctrl,
+					   struct ssam_event_notifier *n)
+{
+	return __ssam_notifier_unregister(ctrl, n, true);
+}
 
 int ssam_controller_event_enable(struct ssam_controller *ctrl,
 				 struct ssam_event_registry reg,
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 6df7c8d4e50e..c418f7f2732d 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -483,4 +483,70 @@ static inline void ssam_remove_clients(struct device *dev) {}
 				    sdev->uid.instance, ret);		\
 	}
 
+
+/* -- Helpers for client-device notifiers. ---------------------------------- */
+
+/**
+ * ssam_device_notifier_register() - Register an event notifier for the
+ * specified client device.
+ * @sdev: The device the notifier should be registered on.
+ * @n:    The event notifier to register.
+ *
+ * Register an event notifier. Increment the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the event is not
+ * marked as an observer and is currently not enabled, it will be enabled
+ * during this call. If the notifier is marked as an observer, no attempt will
+ * be made at enabling any event and no reference count will be modified.
+ *
+ * Notifiers marked as observers do not need to be associated with one specific
+ * event, i.e. as long as no event matching is performed, only the event target
+ * category needs to be set.
+ *
+ * Return: Returns zero on success, %-ENOSPC if there have already been
+ * %INT_MAX notifiers for the event ID/type associated with the notifier block
+ * registered, %-ENOMEM if the corresponding event entry could not be
+ * allocated, %-ENODEV if the device is marked as hot-removed. If this is the
+ * first time that a notifier block is registered for the specific associated
+ * event, returns the status of the event-enable EC-command.
+ */
+static inline int ssam_device_notifier_register(struct ssam_device *sdev,
+						struct ssam_event_notifier *n)
+{
+	/*
+	 * Note that this check does not provide any guarantees whatsoever as
+	 * hot-removal could happen at any point and we can't protect against
+	 * it. Nevertheless, if we can detect hot-removal, bail early to avoid
+	 * communication timeouts.
+	 */
+	if (ssam_device_is_hot_removed(sdev))
+		return -ENODEV;
+
+	return ssam_notifier_register(sdev->ctrl, n);
+}
+
+/**
+ * ssam_device_notifier_unregister() - Unregister an event notifier for the
+ * specified client device.
+ * @sdev: The device the notifier has been registered on.
+ * @n:    The event notifier to unregister.
+ *
+ * Unregister an event notifier. Decrement the usage counter of the associated
+ * SAM event if the notifier is not marked as an observer. If the usage counter
+ * reaches zero, the event will be disabled.
+ *
+ * In case the device has been marked as hot-removed, the event will not be
+ * disabled on the EC, as in those cases any attempt at doing so may time out.
+ *
+ * Return: Returns zero on success, %-ENOENT if the given notifier block has
+ * not been registered on the controller. If the given notifier block was the
+ * last one associated with its specific event, returns the status of the
+ * event-disable EC-command.
+ */
+static inline int ssam_device_notifier_unregister(struct ssam_device *sdev,
+						  struct ssam_event_notifier *n)
+{
+	return __ssam_notifier_unregister(sdev->ctrl, n,
+					  !ssam_device_is_hot_removed(sdev));
+}
+
 #endif /* _LINUX_SURFACE_AGGREGATOR_DEVICE_H */
-- 
cgit v1.2.3


From 25e2ca7301bd3ca5a63a6be41d729eb42202bc21 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 27 May 2022 04:34:43 +0200
Subject: platform/surface: aggregator: Add comment for KIP subsystem category

The KIP subsystem (full name unknown, abbreviation has been obtained
through reverse engineering) handles detachable peripherals such as the
keyboard cover on the Surface Pro X and Surface Pro 8.

It is currently not entirely clear what this subsystem entails, but at
the very least it provides event notifications for when the keyboard
cover on the Surface Pro X and Surface Pro 8 have been detached or
re-attached, as well as the state that the keyboard cover is currently
in (e.g. folded-back, folded laptop-like, closed, etc.).

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220527023447.2460025-9-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/serial_hub.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index c3de43edcffa..26b95ec12733 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -306,7 +306,7 @@ enum ssam_ssh_tc {
 	SSAM_SSH_TC_LPC = 0x0b,
 	SSAM_SSH_TC_TCL = 0x0c,
 	SSAM_SSH_TC_SFL = 0x0d,
-	SSAM_SSH_TC_KIP = 0x0e,
+	SSAM_SSH_TC_KIP = 0x0e,	/* Manages detachable peripherals (Pro X/8 keyboard cover) */
 	SSAM_SSH_TC_EXT = 0x0f,
 	SSAM_SSH_TC_BLD = 0x10,
 	SSAM_SSH_TC_BAS = 0x11,	/* Detachment system (Surface Book 2/3). */
-- 
cgit v1.2.3


From 064520e8aeaa2569f6504a50a37ac801b73656bc Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Wed, 15 Jun 2022 16:43:48 +0800
Subject: ASoC: SOF: Intel: Add support for MeteorLake (MTL)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add platform abstraction for the Meteor Lake platform.

This platform has significant differences compared to the TGL/ADL
generation: it relies on new hardware using the code name 'ACE' and
only supports the INTEL_IPC4 protocol and firmware architecture based
on the Zephyr RTOS

Co-developed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Link: https://lore.kernel.org/r/20220615084348.3489-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/soundwire/sdw_intel.h |   2 +
 sound/soc/sof/intel/Kconfig         |  16 +
 sound/soc/sof/intel/Makefile        |   4 +-
 sound/soc/sof/intel/hda.h           |   3 +
 sound/soc/sof/intel/mtl.c           | 800 ++++++++++++++++++++++++++++++++++++
 sound/soc/sof/intel/mtl.h           |  76 ++++
 sound/soc/sof/intel/pci-mtl.c       |  71 ++++
 sound/soc/sof/intel/shim.h          |   1 +
 8 files changed, 972 insertions(+), 1 deletion(-)
 create mode 100644 sound/soc/sof/intel/mtl.c
 create mode 100644 sound/soc/sof/intel/mtl.h
 create mode 100644 sound/soc/sof/intel/pci-mtl.c

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 67e0d3e750b5..b5b489ea1aef 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -9,6 +9,8 @@
 
 #define SDW_SHIM_BASE			0x2C000
 #define SDW_ALH_BASE			0x2C800
+#define SDW_SHIM_BASE_ACE		0x38000
+#define SDW_ALH_BASE_ACE		0x24000
 #define SDW_LINK_BASE			0x30000
 #define SDW_LINK_SIZE			0x10000
 
diff --git a/sound/soc/sof/intel/Kconfig b/sound/soc/sof/intel/Kconfig
index 80cdc3788bbe..3f54678e810b 100644
--- a/sound/soc/sof/intel/Kconfig
+++ b/sound/soc/sof/intel/Kconfig
@@ -221,6 +221,22 @@ config SND_SOC_SOF_ALDERLAKE
 	  Say Y if you have such a device.
 	  If unsure select "N".
 
+config SND_SOC_SOF_INTEL_MTL
+	tristate
+	select SND_SOC_SOF_HDA_COMMON
+	select SND_SOC_SOF_INTEL_SOUNDWIRE_LINK_BASELINE
+	select SND_SOC_SOF_INTEL_IPC4
+
+config SND_SOC_SOF_METEORLAKE
+	tristate "SOF support for Meteorlake"
+	default SND_SOC_SOF_PCI
+	select SND_SOC_SOF_INTEL_MTL
+	help
+	  This adds support for Sound Open Firmware for Intel(R) platforms
+	  using the Meteorlake processors.
+	  Say Y if you have such a device.
+	  If unsure select "N".
+
 config SND_SOC_SOF_HDA_COMMON
 	tristate
 	select SND_SOC_SOF_INTEL_COMMON
diff --git a/sound/soc/sof/intel/Makefile b/sound/soc/sof/intel/Makefile
index b9d51dc39ffa..a079159bb2f0 100644
--- a/sound/soc/sof/intel/Makefile
+++ b/sound/soc/sof/intel/Makefile
@@ -6,7 +6,7 @@ snd-sof-acpi-intel-bdw-objs := bdw.o
 snd-sof-intel-hda-common-objs := hda.o hda-loader.o hda-stream.o hda-trace.o \
 				 hda-dsp.o hda-ipc.o hda-ctrl.o hda-pcm.o \
 				 hda-dai.o hda-bus.o \
-				 apl.o cnl.o tgl.o icl.o hda-common-ops.o
+				 apl.o cnl.o tgl.o icl.o mtl.o hda-common-ops.o
 snd-sof-intel-hda-common-$(CONFIG_SND_SOC_SOF_HDA_PROBES) += hda-probes.o
 
 snd-sof-intel-hda-objs := hda-codec.o
@@ -24,9 +24,11 @@ snd-sof-pci-intel-apl-objs := pci-apl.o
 snd-sof-pci-intel-cnl-objs := pci-cnl.o
 snd-sof-pci-intel-icl-objs := pci-icl.o
 snd-sof-pci-intel-tgl-objs := pci-tgl.o
+snd-sof-pci-intel-mtl-objs := pci-mtl.o
 
 obj-$(CONFIG_SND_SOC_SOF_MERRIFIELD) += snd-sof-pci-intel-tng.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_APL) += snd-sof-pci-intel-apl.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_CNL) += snd-sof-pci-intel-cnl.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_ICL) += snd-sof-pci-intel-icl.o
 obj-$(CONFIG_SND_SOC_SOF_INTEL_TGL) += snd-sof-pci-intel-tgl.o
+obj-$(CONFIG_SND_SOC_SOF_INTEL_MTL) += snd-sof-pci-intel-mtl.o
diff --git a/sound/soc/sof/intel/hda.h b/sound/soc/sof/intel/hda.h
index 8b7f3c07d478..a3118499e34f 100644
--- a/sound/soc/sof/intel/hda.h
+++ b/sound/soc/sof/intel/hda.h
@@ -714,6 +714,8 @@ extern struct snd_sof_dsp_ops sof_tgl_ops;
 int sof_tgl_ops_init(struct snd_sof_dev *sdev);
 extern struct snd_sof_dsp_ops sof_icl_ops;
 int sof_icl_ops_init(struct snd_sof_dev *sdev);
+extern struct snd_sof_dsp_ops sof_mtl_ops;
+int sof_mtl_ops_init(struct snd_sof_dev *sdev);
 
 extern const struct sof_intel_dsp_desc apl_chip_info;
 extern const struct sof_intel_dsp_desc cnl_chip_info;
@@ -723,6 +725,7 @@ extern const struct sof_intel_dsp_desc tglh_chip_info;
 extern const struct sof_intel_dsp_desc ehl_chip_info;
 extern const struct sof_intel_dsp_desc jsl_chip_info;
 extern const struct sof_intel_dsp_desc adls_chip_info;
+extern const struct sof_intel_dsp_desc mtl_chip_info;
 
 /* Probes support */
 #if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA_PROBES)
diff --git a/sound/soc/sof/intel/mtl.c b/sound/soc/sof/intel/mtl.c
new file mode 100644
index 000000000000..37be77beb415
--- /dev/null
+++ b/sound/soc/sof/intel/mtl.c
@@ -0,0 +1,800 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+//
+// Copyright(c) 2022 Intel Corporation. All rights reserved.
+//
+// Authors: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
+//
+
+/*
+ * Hardware interface for audio DSP on Meteorlake.
+ */
+
+#include <linux/firmware.h>
+#include <sound/sof/ipc4/header.h>
+#include "../ipc4-priv.h"
+#include "../ops.h"
+#include "hda.h"
+#include "hda-ipc.h"
+#include "../sof-audio.h"
+#include "mtl.h"
+
+static const struct snd_sof_debugfs_map mtl_dsp_debugfs[] = {
+	{"hda", HDA_DSP_HDA_BAR, 0, 0x4000, SOF_DEBUGFS_ACCESS_ALWAYS},
+	{"pp", HDA_DSP_PP_BAR,  0, 0x1000, SOF_DEBUGFS_ACCESS_ALWAYS},
+	{"dsp", HDA_DSP_BAR,  0, 0x10000, SOF_DEBUGFS_ACCESS_ALWAYS},
+};
+
+static void mtl_ipc_host_done(struct snd_sof_dev *sdev)
+{
+	/*
+	 * clear busy interrupt to tell dsp controller this interrupt has been accepted,
+	 * not trigger it again
+	 */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDR,
+				       MTL_DSP_REG_HFIPCXTDR_BUSY, MTL_DSP_REG_HFIPCXTDR_BUSY);
+	/*
+	 * clear busy bit to ack dsp the msg has been processed and send reply msg to dsp
+	 */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDA,
+				       MTL_DSP_REG_HFIPCXTDA_BUSY, 0);
+}
+
+static void mtl_ipc_dsp_done(struct snd_sof_dev *sdev)
+{
+	/*
+	 * set DONE bit - tell DSP we have received the reply msg from DSP, and processed it,
+	 * don't send more reply to host
+	 */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDA,
+				       MTL_DSP_REG_HFIPCXIDA_DONE, MTL_DSP_REG_HFIPCXIDA_DONE);
+
+	/* unmask Done interrupt */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXCTL,
+				MTL_DSP_REG_HFIPCXCTL_DONE, MTL_DSP_REG_HFIPCXCTL_DONE);
+}
+
+/* Check if an IPC IRQ occurred */
+static bool mtl_dsp_check_ipc_irq(struct snd_sof_dev *sdev)
+{
+	u32 irq_status;
+	u32 hfintipptr;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+	irq_status = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr + MTL_DSP_IRQSTS);
+
+	dev_vdbg(sdev->dev, "irq handler: irq_status:0x%x\n", irq_status);
+
+	if (irq_status != U32_MAX && (irq_status & MTL_DSP_IRQSTS_IPC))
+		return true;
+
+	return false;
+}
+
+/* Check if an SDW IRQ occurred */
+static bool mtl_dsp_check_sdw_irq(struct snd_sof_dev *sdev)
+{
+	u32 irq_status;
+	u32 hfintipptr;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+	irq_status = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr + MTL_DSP_IRQSTS);
+
+	if (irq_status != U32_MAX && (irq_status & MTL_DSP_IRQSTS_SDW))
+		return true;
+
+	return false;
+}
+
+static int mtl_ipc_send_msg(struct snd_sof_dev *sdev, struct snd_sof_ipc_msg *msg)
+{
+	struct sof_ipc4_msg *msg_data = msg->msg_data;
+
+	/* send the message via mailbox */
+	if (msg_data->data_size)
+		sof_mailbox_write(sdev, sdev->host_box.offset, msg_data->data_ptr,
+				  msg_data->data_size);
+
+	snd_sof_dsp_write(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDDY,
+			  msg_data->extension);
+	snd_sof_dsp_write(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDR,
+			  msg_data->primary | MTL_DSP_REG_HFIPCXIDR_BUSY);
+
+	return 0;
+}
+
+static void mtl_enable_ipc_interrupts(struct snd_sof_dev *sdev)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+
+	/* enable IPC DONE and BUSY interrupts */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, chip->ipc_ctl,
+				MTL_DSP_REG_HFIPCXCTL_BUSY | MTL_DSP_REG_HFIPCXCTL_DONE,
+				MTL_DSP_REG_HFIPCXCTL_BUSY | MTL_DSP_REG_HFIPCXCTL_DONE);
+}
+
+static void mtl_disable_ipc_interrupts(struct snd_sof_dev *sdev)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+
+	/* disable IPC DONE and BUSY interrupts */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, chip->ipc_ctl,
+				MTL_DSP_REG_HFIPCXCTL_BUSY | MTL_DSP_REG_HFIPCXCTL_DONE, 0);
+}
+
+static int mtl_enable_interrupts(struct snd_sof_dev *sdev)
+{
+	u32 hfintipptr;
+	u32 irqinten;
+	u32 host_ipc;
+	u32 hipcie;
+	int ret;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+
+	/* Enable Host IPC and SOUNDWIRE */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, hfintipptr,
+				MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK,
+				MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK);
+
+	/* check if operation was successful */
+	host_ipc = MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK;
+	irqinten = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, hfintipptr, irqinten,
+					    (irqinten & host_ipc) == host_ipc,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "failed to enable Host IPC and/or SOUNDWIRE\n");
+		return ret;
+	}
+
+	/* Set Host IPC interrupt enable */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE,
+				MTL_DSP_REG_HfHIPCIE_IE_MASK, MTL_DSP_REG_HfHIPCIE_IE_MASK);
+
+	/* check if operation was successful */
+	host_ipc = MTL_DSP_REG_HfHIPCIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE, hipcie,
+					    (hipcie & host_ipc) == host_ipc,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "failed to set Host IPC interrupt enable\n");
+		return ret;
+	}
+
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE,
+				MTL_DSP_REG_HfSNDWIE_IE_MASK, MTL_DSP_REG_HfSNDWIE_IE_MASK);
+	host_ipc = MTL_DSP_REG_HfSNDWIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE, hipcie,
+					    (hipcie & host_ipc) == host_ipc,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to set SoundWire IPC interrupt enable\n");
+
+	return ret;
+}
+
+static int mtl_disable_interrupts(struct snd_sof_dev *sdev)
+{
+	u32 hfintipptr;
+	u32 irqinten;
+	u32 host_ipc;
+	u32 hipcie;
+	int ret1;
+	int ret;
+
+	/* read Interrupt IP Pointer */
+	hfintipptr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFINTIPPTR) & MTL_HFINTIPPTR_PTR_MASK;
+
+	/* Disable Host IPC and SOUNDWIRE */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, hfintipptr,
+				MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK, 0);
+
+	/* check if operation was successful */
+	host_ipc = MTL_IRQ_INTEN_L_HOST_IPC_MASK | MTL_IRQ_INTEN_L_SOUNDWIRE_MASK;
+	irqinten = snd_sof_dsp_read(sdev, HDA_DSP_BAR, hfintipptr);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, hfintipptr, irqinten,
+					    (irqinten & host_ipc) == 0,
+					    HDA_DSP_REG_POLL_INTERVAL_US, HDA_DSP_RESET_TIMEOUT_US);
+	/* Continue to disable other interrupts when error happens */
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to disable Host IPC and SoundWire\n");
+
+	/* Set Host IPC interrupt disable */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE,
+				MTL_DSP_REG_HfHIPCIE_IE_MASK, 0);
+
+	/* check if operation was successful */
+	host_ipc = MTL_DSP_REG_HfHIPCIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE);
+	ret1 = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfHIPCIE, hipcie,
+					     (hipcie & host_ipc) == 0,
+					     HDA_DSP_REG_POLL_INTERVAL_US,
+					     HDA_DSP_RESET_TIMEOUT_US);
+	if (ret1 < 0) {
+		dev_err(sdev->dev, "failed to set Host IPC interrupt disable\n");
+		if (!ret)
+			ret = ret1;
+	}
+
+	/* Set SoundWire IPC interrupt disable */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE,
+				MTL_DSP_REG_HfSNDWIE_IE_MASK, 0);
+	host_ipc = MTL_DSP_REG_HfSNDWIE_IE_MASK;
+	hipcie = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE);
+	ret1 = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP_REG_HfSNDWIE, hipcie,
+					     (hipcie & host_ipc) == 0,
+					     HDA_DSP_REG_POLL_INTERVAL_US,
+					     HDA_DSP_RESET_TIMEOUT_US);
+	if (ret1 < 0) {
+		dev_err(sdev->dev, "failed to set SoundWire IPC interrupt disable\n");
+		if (!ret)
+			ret = ret1;
+	}
+
+	return ret;
+}
+
+/* pre fw run operations */
+static int mtl_dsp_pre_fw_run(struct snd_sof_dev *sdev)
+{
+	u32 dsphfpwrsts;
+	u32 dsphfdsscs;
+	u32 cpa;
+	u32 pgs;
+	int ret;
+
+	/* Set the DSP subsystem power on */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFDSSCS,
+				MTL_HFDSSCS_SPA_MASK, MTL_HFDSSCS_SPA_MASK);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	cpa = MTL_HFDSSCS_CPA_MASK;
+	dsphfdsscs = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFDSSCS);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_HFDSSCS, dsphfdsscs,
+					    (dsphfdsscs & cpa) == cpa, HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "failed to enable DSP subsystem\n");
+		return ret;
+	}
+
+	/* Power up gated-DSP-0 domain in order to access the DSP shim register block. */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFPWRCTL,
+				MTL_HFPWRCTL_WPDSPHPXPG, MTL_HFPWRCTL_WPDSPHPXPG);
+
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	pgs = MTL_HFPWRSTS_DSPHPXPGS_MASK;
+	dsphfpwrsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFPWRSTS);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_HFPWRSTS, dsphfpwrsts,
+					    (dsphfpwrsts & pgs) == pgs,
+					    HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to power up gated DSP domain\n");
+
+	/* make sure SoundWire is not power-gated */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_HDA_BAR, MTL_HFPWRCTL,
+				MTL_HfPWRCTL_WPIOXPG(1), MTL_HfPWRCTL_WPIOXPG(1));
+	return ret;
+}
+
+static int mtl_dsp_post_fw_run(struct snd_sof_dev *sdev)
+{
+	int ret;
+
+	if (sdev->first_boot) {
+		struct sof_intel_hda_dev *hdev = sdev->pdata->hw_pdata;
+
+		ret = hda_sdw_startup(sdev);
+		if (ret < 0) {
+			dev_err(sdev->dev, "could not startup SoundWire links\n");
+			return ret;
+		}
+
+		/* Check if IMR boot is usable */
+		if (!sof_debug_check_flag(SOF_DBG_IGNORE_D3_PERSISTENT))
+			hdev->imrboot_supported = true;
+	}
+
+	hda_sdw_int_enable(sdev, true);
+	return 0;
+}
+
+static void mtl_dsp_dump(struct snd_sof_dev *sdev, u32 flags)
+{
+	char *level = (flags & SOF_DBG_DUMP_OPTIONAL) ? KERN_DEBUG : KERN_ERR;
+	u32 romdbgsts;
+	u32 romdbgerr;
+	u32 fwsts;
+	u32 fwlec;
+
+	fwsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_ROM_STS);
+	fwlec = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_ROM_ERROR);
+	romdbgsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFFLGPXQWY);
+	romdbgerr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFFLGPXQWY_ERROR);
+
+	dev_err(sdev->dev, "ROM status: %#x, ROM error: %#x\n", fwsts, fwlec);
+	dev_err(sdev->dev, "ROM debug status: %#x, ROM debug error: %#x\n", romdbgsts,
+		romdbgerr);
+	romdbgsts = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFFLGPXQWY + 0x8 * 3);
+	dev_printk(level, sdev->dev, "ROM feature bit%s enabled\n",
+		   romdbgsts & BIT(24) ? "" : " not");
+}
+
+static bool mtl_dsp_primary_core_is_enabled(struct snd_sof_dev *sdev)
+{
+	int val;
+
+	val = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE);
+	if (val != U32_MAX && val & MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK)
+		return true;
+
+	return false;
+}
+
+static int mtl_dsp_core_power_up(struct snd_sof_dev *sdev, int core)
+{
+	unsigned int cpa;
+	u32 dspcxctl;
+	int ret;
+
+	/* Only the primary core can be powered up by the host */
+	if (core != SOF_DSP_PRIMARY_CORE || mtl_dsp_primary_core_is_enabled(sdev))
+		return 0;
+
+	/* Program the owner of the IP & shim registers (10: Host CPU) */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE,
+				MTL_DSP2CXCTL_PRIMARY_CORE_OSEL,
+				0x2 << MTL_DSP2CXCTL_PRIMARY_CORE_OSEL_SHIFT);
+
+	/* enable SPA bit */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE,
+				MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK,
+				MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	cpa = MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK;
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE, dspcxctl,
+					    (dspcxctl & cpa) == cpa, HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0) {
+		dev_err(sdev->dev, "%s: timeout on MTL_DSP2CXCTL_PRIMARY_CORE read\n",
+			__func__);
+		return ret;
+	}
+
+	/* did core power up ? */
+	dspcxctl = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE);
+	if ((dspcxctl & MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK)
+		!= MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK) {
+		dev_err(sdev->dev, "power up core failed core %d adspcs %#x\n",
+			core, dspcxctl);
+		ret = -EIO;
+	}
+
+	return ret;
+}
+
+static int mtl_dsp_core_power_down(struct snd_sof_dev *sdev, int core)
+{
+	u32 dspcxctl;
+	int ret;
+
+	/* Only the primary core can be powered down by the host */
+	if (core != SOF_DSP_PRIMARY_CORE || !mtl_dsp_primary_core_is_enabled(sdev))
+		return 0;
+
+	/* disable SPA bit */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE,
+				MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK, 0);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_DSP2CXCTL_PRIMARY_CORE, dspcxctl,
+					    !(dspcxctl & MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK),
+					    HDA_DSP_REG_POLL_INTERVAL_US,
+					    HDA_DSP_PD_TIMEOUT * USEC_PER_MSEC);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to power down primary core\n");
+
+	return ret;
+}
+
+static int mtl_dsp_cl_init(struct snd_sof_dev *sdev, int stream_tag, bool imr_boot)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+	unsigned int status;
+	u32 ipc_hdr;
+	int ret;
+
+	/* step 1: purge FW request */
+	ipc_hdr = chip->ipc_req_mask | HDA_DSP_ROM_IPC_CONTROL;
+	if (!imr_boot)
+		ipc_hdr |= HDA_DSP_ROM_IPC_PURGE_FW | ((stream_tag - 1) << 9);
+
+	snd_sof_dsp_write(sdev, HDA_DSP_BAR, chip->ipc_req, ipc_hdr);
+
+	/* step 2: power up primary core */
+	ret = mtl_dsp_core_power_up(sdev, SOF_DSP_PRIMARY_CORE);
+	if (ret < 0) {
+		if (hda->boot_iteration == HDA_FW_BOOT_ATTEMPTS)
+			dev_err(sdev->dev, "dsp core 0/1 power up failed\n");
+		goto err;
+	}
+
+	dev_dbg(sdev->dev, "Primary core power up successful\n");
+
+	/* step 3: wait for IPC DONE bit from ROM */
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, chip->ipc_ack, status,
+					    ((status & chip->ipc_ack_mask) == chip->ipc_ack_mask),
+					    HDA_DSP_REG_POLL_INTERVAL_US, MTL_DSP_PURGE_TIMEOUT_US);
+	if (ret < 0) {
+		if (hda->boot_iteration == HDA_FW_BOOT_ATTEMPTS)
+			dev_err(sdev->dev, "timeout waiting for purge IPC done\n");
+		goto err;
+	}
+
+	/* set DONE bit to clear the reply IPC message */
+	snd_sof_dsp_update_bits_forced(sdev, HDA_DSP_BAR, chip->ipc_ack, chip->ipc_ack_mask,
+				       chip->ipc_ack_mask);
+
+	/* step 4: enable interrupts */
+	ret = mtl_enable_interrupts(sdev);
+	if (ret < 0) {
+		if (hda->boot_iteration == HDA_FW_BOOT_ATTEMPTS)
+			dev_err(sdev->dev, "%s: failed to enable interrupts\n", __func__);
+		goto err;
+	}
+
+	mtl_enable_ipc_interrupts(sdev);
+
+	/*
+	 * ACE workaround: don't wait for ROM INIT.
+	 * The platform cannot catch ROM_INIT_DONE because of a very short
+	 * timing window. Follow the recommendations and skip this part.
+	 */
+
+	return 0;
+
+err:
+	snd_sof_dsp_dbg_dump(sdev, "MTL DSP init fail", 0);
+	mtl_dsp_core_power_down(sdev, SOF_DSP_PRIMARY_CORE);
+	return ret;
+}
+
+static irqreturn_t mtl_ipc_irq_thread(int irq, void *context)
+{
+	struct sof_ipc4_msg notification_data = {{ 0 }};
+	struct snd_sof_dev *sdev = context;
+	bool ipc_irq = false;
+	u32 hipcida;
+	u32 hipctdr;
+
+	hipcida = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDA);
+
+	/* reply message from DSP */
+	if (hipcida & MTL_DSP_REG_HFIPCXIDA_DONE) {
+		/* DSP received the message */
+		snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXCTL,
+					MTL_DSP_REG_HFIPCXCTL_DONE, 0);
+
+		mtl_ipc_dsp_done(sdev);
+
+		ipc_irq = true;
+	}
+
+	hipctdr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDR);
+	if (hipctdr & MTL_DSP_REG_HFIPCXTDR_BUSY) {
+		/* Message from DSP (reply or notification) */
+		u32 extension = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDDY);
+		u32 primary = hipctdr & MTL_DSP_REG_HFIPCXTDR_MSG_MASK;
+
+		/*
+		 * ACE fw sends a new fw ipc message to host to
+		 * notify the status of the last host ipc message
+		 */
+		if (primary & SOF_IPC4_MSG_DIR_MASK) {
+			/* Reply received */
+			struct sof_ipc4_msg *data = sdev->ipc->msg.reply_data;
+
+			data->primary = primary;
+			data->extension = extension;
+
+			spin_lock_irq(&sdev->ipc_lock);
+
+			snd_sof_ipc_get_reply(sdev);
+			snd_sof_ipc_reply(sdev, data->primary);
+
+			spin_unlock_irq(&sdev->ipc_lock);
+		} else {
+			/* Notification received */
+			notification_data.primary = primary;
+			notification_data.extension = extension;
+
+			sdev->ipc->msg.rx_data = &notification_data;
+			snd_sof_ipc_msgs_rx(sdev);
+			sdev->ipc->msg.rx_data = NULL;
+		}
+
+		mtl_ipc_host_done(sdev);
+
+		ipc_irq = true;
+	}
+
+	if (!ipc_irq) {
+		/* This interrupt is not shared so no need to return IRQ_NONE. */
+		dev_dbg_ratelimited(sdev->dev, "%s nothing to do in IPC IRQ thread\n",
+				    __func__);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int mtl_dsp_ipc_get_mailbox_offset(struct snd_sof_dev *sdev)
+{
+	return MTL_DSP_MBOX_UPLINK_OFFSET;
+}
+
+static int mtl_dsp_ipc_get_window_offset(struct snd_sof_dev *sdev, u32 id)
+{
+	return MTL_SRAM_WINDOW_OFFSET(id);
+}
+
+static int mtl_suspend(struct snd_sof_dev *sdev, bool runtime_suspend)
+{
+	struct sof_intel_hda_dev *hda = sdev->pdata->hw_pdata;
+	const struct sof_intel_dsp_desc *chip = hda->desc;
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	struct hdac_bus *bus = sof_to_bus(sdev);
+#endif
+	u32 dsphfdsscs;
+	u32 cpa;
+	int ret;
+	int i;
+
+	mtl_disable_ipc_interrupts(sdev);
+	ret = mtl_disable_interrupts(sdev);
+	if (ret)
+		return ret;
+
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	hda_codec_jack_wake_enable(sdev, runtime_suspend);
+	/* power down all hda link */
+	snd_hdac_ext_bus_link_power_down_all(bus);
+#endif
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFPWRCTL,
+				MTL_HFPWRCTL_WPDSPHPXPG, 0);
+
+	/* Set the DSP subsystem power down */
+	snd_sof_dsp_update_bits(sdev, HDA_DSP_BAR, MTL_HFDSSCS,
+				MTL_HFDSSCS_SPA_MASK, 0);
+
+	/* Wait for unstable CPA read (1 then 0 then 1) just after setting SPA bit */
+	usleep_range(1000, 1010);
+
+	/* poll with timeout to check if operation successful */
+	cpa = MTL_HFDSSCS_CPA_MASK;
+	dsphfdsscs = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_HFDSSCS);
+	ret = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR, MTL_HFDSSCS, dsphfdsscs,
+					    (dsphfdsscs & cpa) == 0, HDA_DSP_REG_POLL_INTERVAL_US,
+					HDA_DSP_RESET_TIMEOUT_US);
+	if (ret < 0)
+		dev_err(sdev->dev, "failed to disable DSP subsystem\n");
+
+	/* reset ref counts for all cores */
+	for (i = 0; i < chip->cores_num; i++)
+		sdev->dsp_core_ref_count[i] = 0;
+
+	/* TODO: need to reset controller? */
+
+	/* display codec can be powered off after link reset */
+	hda_codec_i915_display_power(sdev, false);
+
+	return 0;
+}
+
+static int mtl_dsp_suspend(struct snd_sof_dev *sdev, u32 target_state)
+{
+	const struct sof_dsp_power_state target_dsp_state = {
+		.state = target_state,
+		.substate = target_state == SOF_DSP_PM_D0 ?
+				SOF_HDA_DSP_PM_D0I3 : 0,
+	};
+	int ret;
+
+	ret = mtl_suspend(sdev, false);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_dsp_state);
+}
+
+static int mtl_dsp_runtime_suspend(struct snd_sof_dev *sdev)
+{
+	const struct sof_dsp_power_state target_state = {
+		.state = SOF_DSP_PM_D3,
+	};
+	int ret;
+
+	ret = mtl_suspend(sdev, true);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_state);
+}
+
+static int mtl_resume(struct snd_sof_dev *sdev, bool runtime_resume)
+{
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	struct hdac_bus *bus = sof_to_bus(sdev);
+	struct hdac_ext_link *hlink = NULL;
+#endif
+
+	/* display codec must be powered before link reset */
+	hda_codec_i915_display_power(sdev, true);
+
+#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA)
+	/* check jack status */
+	if (runtime_resume) {
+		hda_codec_jack_wake_enable(sdev, false);
+		if (sdev->system_suspend_target == SOF_SUSPEND_NONE)
+			hda_codec_jack_check(sdev);
+	}
+
+	/* turn off the links that were off before suspend */
+	list_for_each_entry(hlink, &bus->hlink_list, list) {
+		if (!hlink->ref_count)
+			snd_hdac_ext_bus_link_power_down(hlink);
+	}
+
+	/* check dma status and clean up CORB/RIRB buffers */
+	if (!bus->cmd_dma_state)
+		snd_hdac_bus_stop_cmd_io(bus);
+#endif
+
+	return 0;
+}
+
+static int mtl_dsp_resume(struct snd_sof_dev *sdev)
+{
+	const struct sof_dsp_power_state target_state = {
+		.state = SOF_DSP_PM_D0,
+		.substate = SOF_HDA_DSP_PM_D0I0,
+	};
+	int ret;
+
+	ret = mtl_resume(sdev, false);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_state);
+}
+
+static int mtl_dsp_runtime_resume(struct snd_sof_dev *sdev)
+{
+	const struct sof_dsp_power_state target_state = {
+		.state = SOF_DSP_PM_D0,
+	};
+	int ret;
+
+	ret = mtl_resume(sdev, true);
+	if (ret < 0)
+		return ret;
+
+	return snd_sof_dsp_set_power_state(sdev, &target_state);
+}
+
+static void mtl_ipc_dump(struct snd_sof_dev *sdev)
+{
+	u32 hipcctl;
+	u32 hipcida;
+	u32 hipctdr;
+
+	/* read IPC status */
+	hipcida = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXIDA);
+	hipcctl = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXCTL);
+	hipctdr = snd_sof_dsp_read(sdev, HDA_DSP_BAR, MTL_DSP_REG_HFIPCXTDR);
+
+	/* dump the IPC regs */
+	/* TODO: parse the raw msg */
+	dev_err(sdev->dev,
+		"error: host status 0x%8.8x dsp status 0x%8.8x mask 0x%8.8x\n",
+		hipcida, hipctdr, hipcctl);
+}
+
+/* Meteorlake ops */
+struct snd_sof_dsp_ops sof_mtl_ops;
+EXPORT_SYMBOL_NS(sof_mtl_ops, SND_SOC_SOF_INTEL_HDA_COMMON);
+
+int sof_mtl_ops_init(struct snd_sof_dev *sdev)
+{
+	struct sof_ipc4_fw_data *ipc4_data;
+
+	/* common defaults */
+	memcpy(&sof_mtl_ops, &sof_hda_common_ops, sizeof(struct snd_sof_dsp_ops));
+
+	/* shutdown */
+	sof_mtl_ops.shutdown = hda_dsp_shutdown;
+
+	/* doorbell */
+	sof_mtl_ops.irq_thread = mtl_ipc_irq_thread;
+
+	/* ipc */
+	sof_mtl_ops.send_msg = mtl_ipc_send_msg;
+	sof_mtl_ops.get_mailbox_offset = mtl_dsp_ipc_get_mailbox_offset;
+	sof_mtl_ops.get_window_offset = mtl_dsp_ipc_get_window_offset;
+
+	/* debug */
+	sof_mtl_ops.debug_map = mtl_dsp_debugfs;
+	sof_mtl_ops.debug_map_count = ARRAY_SIZE(mtl_dsp_debugfs);
+	sof_mtl_ops.dbg_dump = mtl_dsp_dump;
+	sof_mtl_ops.ipc_dump = mtl_ipc_dump;
+
+	/* pre/post fw run */
+	sof_mtl_ops.pre_fw_run = mtl_dsp_pre_fw_run;
+	sof_mtl_ops.post_fw_run = mtl_dsp_post_fw_run;
+
+	/* parse platform specific extended manifest */
+	sof_mtl_ops.parse_platform_ext_manifest = NULL;
+
+	/* dsp core get/put */
+	/* TODO: add core_get and core_put */
+
+	/* PM */
+	sof_mtl_ops.suspend = mtl_dsp_suspend;
+	sof_mtl_ops.resume = mtl_dsp_resume;
+	sof_mtl_ops.runtime_suspend = mtl_dsp_runtime_suspend;
+	sof_mtl_ops.runtime_resume = mtl_dsp_runtime_resume;
+
+	sdev->private = devm_kzalloc(sdev->dev, sizeof(struct sof_ipc4_fw_data), GFP_KERNEL);
+	if (!sdev->private)
+		return -ENOMEM;
+
+	ipc4_data = sdev->private;
+	ipc4_data->manifest_fw_hdr_offset = SOF_MAN4_FW_HDR_OFFSET;
+
+	/* set DAI ops */
+	hda_set_dai_drv_ops(sdev, &sof_mtl_ops);
+
+	return 0;
+};
+EXPORT_SYMBOL_NS(sof_mtl_ops_init, SND_SOC_SOF_INTEL_HDA_COMMON);
+
+const struct sof_intel_dsp_desc mtl_chip_info = {
+	.cores_num = 3,
+	.init_core_mask = BIT(0),
+	.host_managed_cores_mask = BIT(0),
+	.ipc_req = MTL_DSP_REG_HFIPCXIDR,
+	.ipc_req_mask = MTL_DSP_REG_HFIPCXIDR_BUSY,
+	.ipc_ack = MTL_DSP_REG_HFIPCXIDA,
+	.ipc_ack_mask = MTL_DSP_REG_HFIPCXIDA_DONE,
+	.ipc_ctl = MTL_DSP_REG_HFIPCXCTL,
+	.rom_status_reg = MTL_DSP_ROM_STS,
+	.rom_init_timeout	= 300,
+	.ssp_count = ICL_SSP_COUNT,
+	.ssp_base_offset = CNL_SSP_BASE_OFFSET,
+	.sdw_shim_base = SDW_SHIM_BASE_ACE,
+	.sdw_alh_base = SDW_ALH_BASE_ACE,
+	.check_sdw_irq = mtl_dsp_check_sdw_irq,
+	.check_ipc_irq = mtl_dsp_check_ipc_irq,
+	.cl_init = mtl_dsp_cl_init,
+	.hw_ip_version = SOF_INTEL_ACE_1_0,
+};
+EXPORT_SYMBOL_NS(mtl_chip_info, SND_SOC_SOF_INTEL_HDA_COMMON);
diff --git a/sound/soc/sof/intel/mtl.h b/sound/soc/sof/intel/mtl.h
new file mode 100644
index 000000000000..788bf0e3ea87
--- /dev/null
+++ b/sound/soc/sof/intel/mtl.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * Copyright(c) 2020-2022 Intel Corporation. All rights reserved.
+ */
+
+/* DSP Registers */
+#define MTL_HFDSSCS			0x1000
+#define MTL_HFDSSCS_SPA_MASK		BIT(16)
+#define MTL_HFDSSCS_CPA_MASK		BIT(24)
+#define MTL_HFSNDWIE			0x114C
+#define MTL_HFPWRCTL			0x1D18
+#define MTL_HfPWRCTL_WPIOXPG(x)		BIT((x) + 8)
+#define MTL_HFPWRCTL_WPDSPHPXPG		BIT(0)
+#define MTL_HFPWRSTS			0x1D1C
+#define MTL_HFPWRSTS_DSPHPXPGS_MASK	BIT(0)
+#define MTL_HFINTIPPTR			0x1108
+#define MTL_IRQ_INTEN_L_HOST_IPC_MASK	BIT(0)
+#define MTL_IRQ_INTEN_L_SOUNDWIRE_MASK	BIT(6)
+#define MTL_HFINTIPPTR_PTR_MASK		GENMASK(20, 0)
+
+#define MTL_DSP2CXCAP_PRIMARY_CORE	0x178D00
+#define MTL_DSP2CXCTL_PRIMARY_CORE	0x178D04
+#define MTL_DSP2CXCTL_PRIMARY_CORE_SPA_MASK BIT(0)
+#define MTL_DSP2CXCTL_PRIMARY_CORE_CPA_MASK BIT(8)
+#define MTL_DSP2CXCTL_PRIMARY_CORE_OSEL GENMASK(25, 24)
+#define MTL_DSP2CXCTL_PRIMARY_CORE_OSEL_SHIFT 24
+
+/* IPC Registers */
+#define MTL_DSP_REG_HFIPCXTDR		0x73200
+#define MTL_DSP_REG_HFIPCXTDR_BUSY	BIT(31)
+#define MTL_DSP_REG_HFIPCXTDR_MSG_MASK GENMASK(30, 0)
+#define MTL_DSP_REG_HFIPCXTDA		0x73204
+#define MTL_DSP_REG_HFIPCXTDA_BUSY	BIT(31)
+#define MTL_DSP_REG_HFIPCXIDR		0x73210
+#define MTL_DSP_REG_HFIPCXIDR_BUSY	BIT(31)
+#define MTL_DSP_REG_HFIPCXIDR_MSG_MASK GENMASK(30, 0)
+#define MTL_DSP_REG_HFIPCXIDA		0x73214
+#define MTL_DSP_REG_HFIPCXIDA_DONE	BIT(31)
+#define MTL_DSP_REG_HFIPCXIDA_MSG_MASK GENMASK(30, 0)
+#define MTL_DSP_REG_HFIPCXCTL		0x73228
+#define MTL_DSP_REG_HFIPCXCTL_BUSY	BIT(0)
+#define MTL_DSP_REG_HFIPCXCTL_DONE	BIT(1)
+#define MTL_DSP_REG_HFIPCXTDDY		0x73300
+#define MTL_DSP_REG_HFIPCXIDDY		0x73380
+#define MTL_DSP_REG_HfHIPCIE		0x1140
+#define MTL_DSP_REG_HfHIPCIE_IE_MASK	BIT(0)
+#define MTL_DSP_REG_HfSNDWIE		0x114C
+#define MTL_DSP_REG_HfSNDWIE_IE_MASK	GENMASK(3, 0)
+
+#define MTL_DSP_IRQSTS			0x20
+#define MTL_DSP_IRQSTS_IPC		BIT(0)
+#define MTL_DSP_IRQSTS_SDW		BIT(6)
+
+#define MTL_DSP_PURGE_TIMEOUT_US	20000000 /* 20s */
+#define MTL_DSP_REG_POLL_INTERVAL_US	10	/* 10 us */
+
+/* Memory windows */
+#define MTL_SRAM_WINDOW_OFFSET(x)	(0x180000 + 0x8000 * (x))
+
+#define MTL_DSP_MBOX_UPLINK_OFFSET	(MTL_SRAM_WINDOW_OFFSET(0) + 0x1000)
+#define MTL_DSP_MBOX_UPLINK_SIZE	0x1000
+#define MTL_DSP_MBOX_DOWNLINK_OFFSET	MTL_SRAM_WINDOW_OFFSET(1)
+#define MTL_DSP_MBOX_DOWNLINK_SIZE	0x1000
+
+/* FW registers */
+#define MTL_DSP_ROM_STS			MTL_SRAM_WINDOW_OFFSET(0) /* ROM status */
+#define MTL_DSP_ROM_ERROR		(MTL_SRAM_WINDOW_OFFSET(0) + 0x4) /* ROM error code */
+
+#define MTL_DSP_REG_HFFLGPXQWY		0x163200 /* ROM debug status */
+#define MTL_DSP_REG_HFFLGPXQWY_ERROR	0x163204 /* ROM debug error code */
+#define MTL_DSP_REG_HfIMRIS1		0x162088
+#define MTL_DSP_REG_HfIMRIS1_IU_MASK	BIT(0)
+
diff --git a/sound/soc/sof/intel/pci-mtl.c b/sound/soc/sof/intel/pci-mtl.c
new file mode 100644
index 000000000000..899b00d53d64
--- /dev/null
+++ b/sound/soc/sof/intel/pci-mtl.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+//
+// This file is provided under a dual BSD/GPLv2 license.  When using or
+// redistributing this file, you may do so under either license.
+//
+// Copyright(c) 2018-2022 Intel Corporation. All rights reserved.
+//
+// Author: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
+//
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <sound/soc-acpi.h>
+#include <sound/soc-acpi-intel-match.h>
+#include <sound/sof.h>
+#include "../ops.h"
+#include "../sof-pci-dev.h"
+
+/* platform specific devices */
+#include "hda.h"
+#include "mtl.h"
+
+static const struct sof_dev_desc mtl_desc = {
+	.use_acpi_target_states	= true,
+	.machines               = snd_soc_acpi_intel_mtl_machines,
+	.alt_machines		= snd_soc_acpi_intel_mtl_sdw_machines,
+	.resindex_lpe_base      = 0,
+	.resindex_pcicfg_base   = -1,
+	.resindex_imr_base      = -1,
+	.irqindex_host_ipc      = -1,
+	.chip_info = &mtl_chip_info,
+	.ipc_supported_mask	= BIT(SOF_INTEL_IPC4),
+	.ipc_default		= SOF_INTEL_IPC4,
+	.default_fw_path = {
+		[SOF_INTEL_IPC4] = "intel/sof-ipc4/mtl",
+	},
+	.default_tplg_path = {
+		[SOF_INTEL_IPC4] = "intel/sof-ace-tplg",
+	},
+	.default_fw_filename = {
+		[SOF_INTEL_IPC4] = "dsp_basefw.bin",
+	},
+	.nocodec_tplg_filename = "sof-mtl-nocodec.tplg",
+	.ops = &sof_mtl_ops,
+	.ops_init = sof_mtl_ops_init,
+};
+
+/* PCI IDs */
+static const struct pci_device_id sof_pci_ids[] = {
+	{ PCI_DEVICE(0x8086, 0x7E28), /* MTL */
+		.driver_data = (unsigned long)&mtl_desc},
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, sof_pci_ids);
+
+/* pci_driver definition */
+static struct pci_driver snd_sof_pci_intel_mtl_driver = {
+	.name = "sof-audio-pci-intel-mtl",
+	.id_table = sof_pci_ids,
+	.probe = hda_pci_intel_probe,
+	.remove = sof_pci_remove,
+	.shutdown = sof_pci_shutdown,
+	.driver = {
+		.pm = &sof_pci_pm,
+	},
+};
+module_pci_driver(snd_sof_pci_intel_mtl_driver);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_IMPORT_NS(SND_SOC_SOF_INTEL_HDA_COMMON);
+MODULE_IMPORT_NS(SND_SOC_SOF_PCI_DEV);
diff --git a/sound/soc/sof/intel/shim.h b/sound/soc/sof/intel/shim.h
index 371991fa474f..638159bee864 100644
--- a/sound/soc/sof/intel/shim.h
+++ b/sound/soc/sof/intel/shim.h
@@ -20,6 +20,7 @@ enum sof_intel_hw_ip_version {
 	SOF_INTEL_CAVS_1_8,	/* CannonLake, CometLake, CoffeeLake */
 	SOF_INTEL_CAVS_2_0,	/* IceLake, JasperLake */
 	SOF_INTEL_CAVS_2_5,	/* TigerLake, AlderLake */
+	SOF_INTEL_ACE_1_0,	/* MeteorLake */
 };
 
 /*
-- 
cgit v1.2.3


From 003cbe046171596809c2f37dc07e69df1b4d9f95 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Date: Wed, 1 Jun 2022 20:58:55 +0530
Subject: pinctrl: Add pingroup and define PINCTRL_PINGROUP

Add 'struct pingroup' to represent pingroup and 'PINCTRL_PINGROUP'
macro for inline use. Both are used to manage and represent
larger number of pingroups.

Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@amd.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220601152900.1012813-2-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinctrl.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 70b45d28e7a9..487117ccb1bc 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -26,6 +26,26 @@ struct pin_config_item;
 struct gpio_chip;
 struct device_node;
 
+/**
+ * struct pingroup - provides information on pingroup
+ * @name: a name for pingroup
+ * @pins: an array of pins in the pingroup
+ * @npins: number of pins in the pingroup
+ */
+struct pingroup {
+	const char *name;
+	const unsigned int *pins;
+	size_t npins;
+};
+
+/* Convenience macro to define a single named or anonymous pingroup */
+#define PINCTRL_PINGROUP(_name, _pins, _npins)	\
+(struct pingroup){				\
+	.name = _name,				\
+	.pins = _pins,				\
+	.npins = _npins,			\
+}
+
 /**
  * struct pinctrl_pin_desc - boards/machines provide information on their
  * pins, pads or other muxable units in this struct
-- 
cgit v1.2.3


From af89cd45603483135bdd238fcb3fa871155a0ae1 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 May 2022 09:57:34 +0200
Subject: clk: Improve documentation for devm_clk_get() and its optional
 variant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make use of "Context:" and "Return:". Mention that the clk is not to be
expected to be prepared, previously only not being enabled was mentioned
which probably dates from the times when the concept of clk preparation
wasn't invented yet.

Also describe devm_clk_get_optional() fully instead of just referencing
devm_clk_get().

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20220520075737.758761-2-u.kleine-koenig@pengutronix.de
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 39faa54efe88..c8fc398d2ad7 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -443,15 +443,16 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
  * @dev: device for clock "consumer"
  * @id: clock consumer ID
  *
- * Returns a struct clk corresponding to the clock producer, or
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
  * valid IS_ERR() condition containing errno.  The implementation
  * uses @dev and @id to determine the clock consumer, and thereby
  * the clock producer.  (IOW, @id may be identical strings, but
  * clk_get may return different clock producers depending on @dev.)
  *
- * Drivers must assume that the clock source is not enabled.
- *
- * devm_clk_get should not be called from within interrupt context.
+ * Drivers must assume that the clock source is neither prepared nor
+ * enabled.
  *
  * The clock will automatically be freed when the device is unbound
  * from the bus.
@@ -464,8 +465,20 @@ struct clk *devm_clk_get(struct device *dev, const char *id);
  * @dev: device for clock "consumer"
  * @id: clock consumer ID
  *
- * Behaves the same as devm_clk_get() except where there is no clock producer.
- * In this case, instead of returning -ENOENT, the function returns NULL.
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get().
+ *
+ * Drivers must assume that the clock source is neither prepared nor
+ * enabled.
+ *
+ * The clock will automatically be freed when the device is unbound
+ * from the bus.
  */
 struct clk *devm_clk_get_optional(struct device *dev, const char *id);
 
-- 
cgit v1.2.3


From 7ef9651e9792b08eb310c6beb202cbc947f43cab Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 20 May 2022 09:57:36 +0200
Subject: clk: Provide new devm_clk helpers for prepared and enabled clocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a driver keeps a clock prepared (or enabled) during the whole
lifetime of the driver, these helpers allow to simplify the drivers.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <aardelean@deviqon.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20220520075737.758761-4-u.kleine-koenig@pengutronix.de
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-devres.c |  27 ++++++++++++
 include/linux/clk.h      | 109 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index c822f4ef1584..43ccd20e0298 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c
@@ -66,12 +66,39 @@ struct clk *devm_clk_get(struct device *dev, const char *id)
 }
 EXPORT_SYMBOL(devm_clk_get);
 
+struct clk *devm_clk_get_prepared(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get, clk_prepare, clk_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_prepared);
+
+struct clk *devm_clk_get_enabled(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get,
+			      clk_prepare_enable, clk_disable_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_enabled);
+
 struct clk *devm_clk_get_optional(struct device *dev, const char *id)
 {
 	return __devm_clk_get(dev, id, clk_get_optional, NULL, NULL);
 }
 EXPORT_SYMBOL(devm_clk_get_optional);
 
+struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get_optional,
+			      clk_prepare, clk_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_optional_prepared);
+
+struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id)
+{
+	return __devm_clk_get(dev, id, clk_get_optional,
+			      clk_prepare_enable, clk_disable_unprepare);
+}
+EXPORT_SYMBOL_GPL(devm_clk_get_optional_enabled);
+
 struct clk_bulk_devres {
 	struct clk_bulk_data *clks;
 	int num_clks;
diff --git a/include/linux/clk.h b/include/linux/clk.h
index c8fc398d2ad7..c13061cabdfc 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -459,6 +459,47 @@ int __must_check devm_clk_bulk_get_all(struct device *dev,
  */
 struct clk *devm_clk_get(struct device *dev, const char *id);
 
+/**
+ * devm_clk_get_prepared - devm_clk_get() + clk_prepare()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  (IOW, @id may be identical strings, but
+ * clk_get may return different clock producers depending on @dev.)
+ *
+ * The returned clk (if valid) is prepared. Drivers must however assume
+ * that the clock is not enabled.
+ *
+ * The clock will automatically be unprepared and freed when the device
+ * is unbound from the bus.
+ */
+struct clk *devm_clk_get_prepared(struct device *dev, const char *id);
+
+/**
+ * devm_clk_get_enabled - devm_clk_get() + clk_prepare_enable()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  (IOW, @id may be identical strings, but
+ * clk_get may return different clock producers depending on @dev.)
+ *
+ * The returned clk (if valid) is prepared and enabled.
+ *
+ * The clock will automatically be disabled, unprepared and freed
+ * when the device is unbound from the bus.
+ */
+struct clk *devm_clk_get_enabled(struct device *dev, const char *id);
+
 /**
  * devm_clk_get_optional - lookup and obtain a managed reference to an optional
  *			   clock producer.
@@ -482,6 +523,50 @@ struct clk *devm_clk_get(struct device *dev, const char *id);
  */
 struct clk *devm_clk_get_optional(struct device *dev, const char *id);
 
+/**
+ * devm_clk_get_optional_prepared - devm_clk_get_optional() + clk_prepare()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get_prepared().
+ *
+ * The returned clk (if valid) is prepared. Drivers must however
+ * assume that the clock is not enabled.
+ *
+ * The clock will automatically be unprepared and freed when the
+ * device is unbound from the bus.
+ */
+struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id);
+
+/**
+ * devm_clk_get_optional_enabled - devm_clk_get_optional() +
+ *                                 clk_prepare_enable()
+ * @dev: device for clock "consumer"
+ * @id: clock consumer ID
+ *
+ * Context: May sleep.
+ *
+ * Return: a struct clk corresponding to the clock producer, or
+ * valid IS_ERR() condition containing errno.  The implementation
+ * uses @dev and @id to determine the clock consumer, and thereby
+ * the clock producer.  If no such clk is found, it returns NULL
+ * which serves as a dummy clk.  That's the only difference compared
+ * to devm_clk_get_enabled().
+ *
+ * The returned clk (if valid) is prepared and enabled.
+ *
+ * The clock will automatically be disabled, unprepared and freed
+ * when the device is unbound from the bus.
+ */
+struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id);
+
 /**
  * devm_get_clk_from_child - lookup and obtain a managed reference to a
  *			     clock producer from child node.
@@ -826,12 +911,36 @@ static inline struct clk *devm_clk_get(struct device *dev, const char *id)
 	return NULL;
 }
 
+static inline struct clk *devm_clk_get_prepared(struct device *dev,
+						const char *id)
+{
+	return NULL;
+}
+
+static inline struct clk *devm_clk_get_enabled(struct device *dev,
+					       const char *id)
+{
+	return NULL;
+}
+
 static inline struct clk *devm_clk_get_optional(struct device *dev,
 						const char *id)
 {
 	return NULL;
 }
 
+static inline struct clk *devm_clk_get_optional_prepared(struct device *dev,
+							 const char *id)
+{
+	return NULL;
+}
+
+static inline struct clk *devm_clk_get_optional_enabled(struct device *dev,
+							const char *id)
+{
+	return NULL;
+}
+
 static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 						 struct clk_bulk_data *clks)
 {
-- 
cgit v1.2.3


From 5a0e4529d9aee8ce348f628ad476c9ddb6cf457d Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:52 -0500
Subject: dmaengine: dw-edma: Remove unused irq field in struct dw_edma_chip

The "irq" field of struct dw_edma_chip was never used. Remove it.

Link: https://lore.kernel.org/r/20220524152159.2370739-2-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-pcie.c | 1 -
 include/linux/dma/edma.h           | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index cee7aa231d7b..bc07923c3bc0 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -214,7 +214,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	chip->dw = dw;
 	chip->dev = dev;
 	chip->id = pdev->devfn;
-	chip->irq = pdev->irq;
 
 	dw->mf = vsec_data.mf;
 	dw->nr_irqs = nr_irqs;
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index cab6e18773da..d4333e721588 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -18,13 +18,11 @@ struct dw_edma;
  * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
  * @dev:		 struct device of the eDMA controller
  * @id:			 instance ID
- * @irq:		 irq line
  * @dw:			 struct dw_edma that is filed by dw_edma_probe()
  */
 struct dw_edma_chip {
 	struct device		*dev;
 	int			id;
-	int			irq;
 	struct dw_edma		*dw;
 };
 
-- 
cgit v1.2.3


From d92725256b4f22d084b813b37ddc394da79aacab Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 30 May 2022 14:34:50 -0400
Subject: mm: avoid unnecessary page fault retires on shared memory types

I observed that for each of the shared file-backed page faults, we're very
likely to retry one more time for the 1st write fault upon no page.  It's
because we'll need to release the mmap lock for dirty rate limit purpose
with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()).

Then after that throttling we return VM_FAULT_RETRY.

We did that probably because VM_FAULT_RETRY is the only way we can return
to the fault handler at that time telling it we've released the mmap lock.

However that's not ideal because it's very likely the fault does not need
to be retried at all since the pgtable was well installed before the
throttling, so the next continuous fault (including taking mmap read lock,
walk the pgtable, etc.) could be in most cases unnecessary.

It's not only slowing down page faults for shared file-backed, but also add
more mmap lock contention which is in most cases not needed at all.

To observe this, one could try to write to some shmem page and look at
"pgfault" value in /proc/vmstat, then we should expect 2 counts for each
shmem write simply because we retried, and vm event "pgfault" will capture
that.

To make it more efficient, add a new VM_FAULT_COMPLETED return code just to
show that we've completed the whole fault and released the lock.  It's also
a hint that we should very possibly not need another fault immediately on
this page because we've just completed it.

This patch provides a ~12% perf boost on my aarch64 test VM with a simple
program sequentially dirtying 400MB shmem file being mmap()ed and these are
the time it needs:

  Before: 650.980 ms (+-1.94%)
  After:  569.396 ms (+-1.38%)

I believe it could help more than that.

We need some special care on GUP and the s390 pgfault handler (for gmap
code before returning from pgfault), the rest changes in the page fault
handlers should be relatively straightforward.

Another thing to mention is that mm_account_fault() does take this new
fault as a generic fault to be accounted, unlike VM_FAULT_RETRY.

I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do
not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping
them as-is.

Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vineet Gupta <vgupta@kernel.org>
Acked-by: Guo Ren <guoren@kernel.org>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>	[arm part]
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Stafford Horne <shorne@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Weinberger <richard@nod.at>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Will Deacon <will@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Rich Felker <dalias@libc.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Helge Deller <deller@gmx.de>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/mm/fault.c         |  4 ++++
 arch/arc/mm/fault.c           |  4 ++++
 arch/arm/mm/fault.c           |  4 ++++
 arch/arm64/mm/fault.c         |  4 ++++
 arch/csky/mm/fault.c          |  4 ++++
 arch/hexagon/mm/vm_fault.c    |  4 ++++
 arch/ia64/mm/fault.c          |  4 ++++
 arch/m68k/mm/fault.c          |  4 ++++
 arch/microblaze/mm/fault.c    |  4 ++++
 arch/mips/mm/fault.c          |  4 ++++
 arch/nios2/mm/fault.c         |  4 ++++
 arch/openrisc/mm/fault.c      |  4 ++++
 arch/parisc/mm/fault.c        |  4 ++++
 arch/powerpc/mm/copro_fault.c |  5 +++++
 arch/powerpc/mm/fault.c       |  5 +++++
 arch/riscv/mm/fault.c         |  4 ++++
 arch/s390/mm/fault.c          | 12 ++++++++++++
 arch/sh/mm/fault.c            |  4 ++++
 arch/sparc/mm/fault_32.c      |  4 ++++
 arch/sparc/mm/fault_64.c      |  5 +++++
 arch/um/kernel/trap.c         |  4 ++++
 arch/x86/mm/fault.c           |  4 ++++
 arch/xtensa/mm/fault.c        |  4 ++++
 include/linux/mm_types.h      |  2 ++
 mm/gup.c                      | 34 +++++++++++++++++++++++++++++++++-
 mm/memory.c                   |  2 +-
 26 files changed, 139 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index ec20c1004abf..ef427a6bdd1a 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -155,6 +155,10 @@ retry:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index dad27e4d69ff..5ca59a482632 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -146,6 +146,10 @@ retry:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	/*
 	 * Fault retry nuances, mmap_lock already relinquished by core mm
 	 */
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index a062e07516dd..46cccd6bf705 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -322,6 +322,10 @@ retry:
 		return 0;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (!(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_RETRY) {
 			flags |= FAULT_FLAG_TRIED;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c5e11768e5c1..de166cdeb89a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -608,6 +608,10 @@ retry:
 		return 0;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (fault & VM_FAULT_RETRY) {
 		mm_flags |= FAULT_FLAG_TRIED;
 		goto retry;
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c
index 7215a46b6b8e..e15f736cca4b 100644
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -285,6 +285,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
 		flags |= FAULT_FLAG_TRIED;
 
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 4fac4b9eb316..f73c7cbfe326 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -96,6 +96,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	/* The most common case -- we are done. */
 	if (likely(!(fault & VM_FAULT_ERROR))) {
 		if (fault & VM_FAULT_RETRY) {
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 07379d1a227f..ef78c2d66cdd 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -139,6 +139,10 @@ retry:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		/*
 		 * We ran out of memory, or some other thing happened
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 71aa9f6315dc..4d2837eb3e2a 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -141,6 +141,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return 0;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index a9626e6a68af..5c40c3ebe52f 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -222,6 +222,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index b08bc556d30d..a27045f5a556 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -162,6 +162,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index a32f14cd72f2..edaca0a6c1c1 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -139,6 +139,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 53b760af3bb7..b4762d66e9ef 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -165,6 +165,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 84bc437be5cd..9ad80d4d3389 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -311,6 +311,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		/*
 		 * We hit a shared mapping outside of the file, or some
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index c1cb21a00884..7c507fb48182 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -65,6 +65,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
 
 	ret = 0;
 	*flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (*flt & VM_FAULT_COMPLETED)
+		return 0;
+
 	if (unlikely(*flt & VM_FAULT_ERROR)) {
 		if (*flt & VM_FAULT_OOM) {
 			ret = -ENOMEM;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d53fed4eccbd..014005428687 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -511,6 +511,10 @@ retry:
 	if (fault_signal_pending(fault, regs))
 		return user_mode(regs) ? 0 : SIGBUS;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		goto out;
+
 	/*
 	 * Handle the retry right now, the mmap_lock has been released in that
 	 * case.
@@ -525,6 +529,7 @@ retry:
 	if (unlikely(fault & VM_FAULT_ERROR))
 		return mm_fault_error(regs, address, fault);
 
+out:
 	/*
 	 * Major/minor page fault accounting.
 	 */
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 40694f0cab9e..f2fbd1400b7c 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -326,6 +326,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_RETRY)) {
 		flags |= FAULT_FLAG_TRIED;
 
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e173b6187ad5..973dcd05c293 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -433,6 +433,17 @@ retry:
 			goto out_up;
 		goto out;
 	}
+
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED) {
+		if (gmap) {
+			mmap_read_lock(mm);
+			goto out_gmap;
+		}
+		fault = 0;
+		goto out;
+	}
+
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
@@ -452,6 +463,7 @@ retry:
 		mmap_read_lock(mm);
 		goto retry;
 	}
+out_gmap:
 	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
 		address =  __gmap_link(gmap, current->thread.gmap_addr,
 				       address);
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index e175667b1363..acd2f5e50bfc 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -485,6 +485,10 @@ good_area:
 		if (mm_fault_error(regs, error_code, address, fault))
 			return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (fault & VM_FAULT_RETRY) {
 		flags |= FAULT_FLAG_TRIED;
 
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index ad569d9bd124..91259f291c54 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -190,6 +190,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		return;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 253e07043298..4acc12eafbf5 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -427,6 +427,10 @@ good_area:
 	if (fault_signal_pending(fault, regs))
 		goto exit_exception;
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		goto lock_released;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -449,6 +453,7 @@ good_area:
 	}
 	mmap_read_unlock(mm);
 
+lock_released:
 	mm_rss = get_mm_rss(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 	mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index d1d5d0be0308..d3ce21c4ca32 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -76,6 +76,10 @@ good_area:
 		if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
 			goto out_nosemaphore;
 
+		/* The fault is fully completed (including releasing mmap lock) */
+		if (fault & VM_FAULT_COMPLETED)
+			return 0;
+
 		if (unlikely(fault & VM_FAULT_ERROR)) {
 			if (fault & VM_FAULT_OOM) {
 				goto out_of_memory;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fad8faa29d04..fe10c6d76bac 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1408,6 +1408,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	/*
 	 * If we need to retry the mmap_lock has already been released,
 	 * and if there is a fatal signal pending there is no guarantee
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 16f0a5ff5799..8c781b05c0bd 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -172,6 +172,10 @@ good_area:
 		return;
 	}
 
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED)
+		return;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c29ab4c0cd5c..6b961a29bf26 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t;
  * @VM_FAULT_NEEDDSYNC:		->fault did not modify page tables and needs
  *				fsync() to complete (for synchronous page faults
  *				in DAX)
+ * @VM_FAULT_COMPLETED:		->fault completed, meanwhile mmap lock released
  * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
  *
  */
@@ -746,6 +747,7 @@ enum vm_fault_reason {
 	VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
 	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
 	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
+	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
 
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..407a81d5ca03 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -951,6 +951,25 @@ static int faultin_page(struct vm_area_struct *vma,
 	}
 
 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+	if (ret & VM_FAULT_COMPLETED) {
+		/*
+		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+		 * mmap lock in the page fault handler. Sanity check this.
+		 */
+		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+		if (locked)
+			*locked = 0;
+		/*
+		 * We should do the same as VM_FAULT_RETRY, but let's not
+		 * return -EBUSY since that's not reflecting the reality of
+		 * what has happened - we've just fully completed a page
+		 * fault, with the mmap lock released.  Use -EAGAIN to show
+		 * that we want to take the mmap lock _again_.
+		 */
+		return -EAGAIN;
+	}
+
 	if (ret & VM_FAULT_ERROR) {
 		int err = vm_fault_to_errno(ret, *flags);
 
@@ -1177,6 +1196,7 @@ retry:
 			case 0:
 				goto retry;
 			case -EBUSY:
+			case -EAGAIN:
 				ret = 0;
 				fallthrough;
 			case -EFAULT:
@@ -1303,6 +1323,18 @@ retry:
 		return -EINTR;
 
 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+	if (ret & VM_FAULT_COMPLETED) {
+		/*
+		 * NOTE: it's a pity that we need to retake the lock here
+		 * to pair with the unlock() in the callers. Ideally we
+		 * could tell the callers so they do not need to unlock.
+		 */
+		mmap_read_lock(mm);
+		*unlocked = true;
+		return 0;
+	}
+
 	if (ret & VM_FAULT_ERROR) {
 		int err = vm_fault_to_errno(ret, 0);
 
@@ -1368,7 +1400,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
 			/* VM_FAULT_RETRY couldn't trigger, bypass */
 			return ret;
 
-		/* VM_FAULT_RETRY cannot return errors */
+		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
 		if (!*locked) {
 			BUG_ON(ret < 0);
 			BUG_ON(ret >= nr_pages);
diff --git a/mm/memory.c b/mm/memory.c
index 7a089145cad4..580c62febe42 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3020,7 +3020,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
 		balance_dirty_pages_ratelimited(mapping);
 		if (fpin) {
 			fput(fpin);
-			return VM_FAULT_RETRY;
+			return VM_FAULT_COMPLETED;
 		}
 	}
 
-- 
cgit v1.2.3


From bcc728eb4f446073e0160671d7d0059a4e9aa300 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Tue, 31 May 2022 10:04:21 +0800
Subject: mm/damon: remove obsolete comments of kdamond_stop

Since commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") delete
kdamond_stop and change to use kthread stop mechanism, these obsolete
comments should be removed accordingly.

Link: https://lkml.kernel.org/r/20220531020421.46849-1-zhouchengming@bytedance.com
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 7c62da31ce4b..2765c7d99beb 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -397,7 +397,6 @@ struct damon_callback {
  * detail.
  *
  * @kdamond:		Kernel thread who does the monitoring.
- * @kdamond_stop:	Notifies whether kdamond should stop.
  * @kdamond_lock:	Mutex for the synchronizations with @kdamond.
  *
  * For each monitoring context, one kernel thread for the monitoring is
@@ -406,14 +405,14 @@ struct damon_callback {
  * Once started, the monitoring thread runs until explicitly required to be
  * terminated or every monitoring target is invalid.  The validity of the
  * targets is checked via the &damon_operations.target_valid of @ops.  The
- * termination can also be explicitly requested by writing non-zero to
- * @kdamond_stop.  The thread sets @kdamond to NULL when it terminates.
- * Therefore, users can know whether the monitoring is ongoing or terminated by
- * reading @kdamond.  Reads and writes to @kdamond and @kdamond_stop from
- * outside of the monitoring thread must be protected by @kdamond_lock.
- *
- * Note that the monitoring thread protects only @kdamond and @kdamond_stop via
- * @kdamond_lock.  Accesses to other fields must be protected by themselves.
+ * termination can also be explicitly requested by calling damon_stop().
+ * The thread sets @kdamond to NULL when it terminates. Therefore, users can
+ * know whether the monitoring is ongoing or terminated by reading @kdamond.
+ * Reads and writes to @kdamond from outside of the monitoring thread must
+ * be protected by @kdamond_lock.
+ *
+ * Note that the monitoring thread protects only @kdamond via @kdamond_lock.
+ * Accesses to other fields must be protected by themselves.
  *
  * @ops:	Set of monitoring operations for given use cases.
  * @callback:	Set of callbacks for monitoring events notifications.
-- 
cgit v1.2.3


From 9384d79249d04b03572abb7e551a35d99c9268c0 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Mon, 6 Jun 2022 16:15:33 +0200
Subject: mm/highmem: delete memmove_page()

Matthew Wilcox reported that, while he was looking at memmove_page(), he
realized that it can't actually work.

The reasons are hidden in its implementation, which makes use of memmove()
on logical addresses provided by kmap_local_page().  memmove() does the
wrong thing when it tests "if (dest <= src)".

Therefore, delete memmove_page().

No need to change any other code because we have no call sites of
memmove_page() across the whole kernel.

Link: https://lkml.kernel.org/r/20220606141533.555-1-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Reported-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 3af34de54330..fee9835e3793 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -336,19 +336,6 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off,
 	kunmap_local(dst);
 }
 
-static inline void memmove_page(struct page *dst_page, size_t dst_off,
-			       struct page *src_page, size_t src_off,
-			       size_t len)
-{
-	char *dst = kmap_local_page(dst_page);
-	char *src = kmap_local_page(src_page);
-
-	VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
-	memmove(dst + dst_off, src + src_off, len);
-	kunmap_local(src);
-	kunmap_local(dst);
-}
-
 static inline void memset_page(struct page *page, size_t offset, int val,
 			       size_t len)
 {
-- 
cgit v1.2.3


From c200d90049dbe08fa8b016f74b713fddefca0479 Mon Sep 17 00:00:00 2001
From: Patrick Wang <patrick.wang.shcn@gmail.com>
Date: Sat, 11 Jun 2022 11:55:48 +0800
Subject: mm: kmemleak: remove kmemleak_not_leak_phys() and the min_count
 argument to kmemleak_alloc_phys()

Patch series "mm: kmemleak: store objects allocated with physical address
separately and check when scan", v4.

The kmemleak_*_phys() interface uses "min_low_pfn" and "max_low_pfn" to
check address.  But on some architectures, kmemleak_*_phys() is called
before those two variables initialized.  The following steps will be
taken:

1) Add OBJECT_PHYS flag and rbtree for the objects allocated
   with physical address
2) Store physical address in objects if allocated with OBJECT_PHYS
3) Check the boundary when scan instead of in kmemleak_*_phys()

This patch set will solve:
https://lore.kernel.org/r/20220527032504.30341-1-yee.lee@mediatek.com
https://lore.kernel.org/r/9dd08bb5-f39e-53d8-f88d-bec598a08c93@gmail.com

v3: https://lore.kernel.org/r/20220609124950.1694394-1-patrick.wang.shcn@gmail.com
v2: https://lore.kernel.org/r/20220603035415.1243913-1-patrick.wang.shcn@gmail.com
v1: https://lore.kernel.org/r/20220531150823.1004101-1-patrick.wang.shcn@gmail.com


This patch (of 4):

Remove the unused kmemleak_not_leak_phys() function.  And remove the
min_count argument to kmemleak_alloc_phys() function, assume it's 0.

Link: https://lkml.kernel.org/r/20220611035551.1823303-1-patrick.wang.shcn@gmail.com
Link: https://lkml.kernel.org/r/20220611035551.1823303-2-patrick.wang.shcn@gmail.com
Signed-off-by: Patrick Wang <patrick.wang.shcn@gmail.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Yee Lee <yee.lee@mediatek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/dev-tools/kmemleak.rst    |  1 -
 drivers/of/fdt.c                        |  2 +-
 include/linux/kmemleak.h                |  8 ++------
 mm/kmemleak.c                           | 20 +++-----------------
 mm/memblock.c                           | 14 +++++++-------
 tools/testing/memblock/linux/kmemleak.h |  2 +-
 6 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 1c935f41cd3a..5483fd39ef29 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -174,7 +174,6 @@ mapping:
 
 - ``kmemleak_alloc_phys``
 - ``kmemleak_free_part_phys``
-- ``kmemleak_not_leak_phys``
 - ``kmemleak_ignore_phys``
 
 Dealing with false positives/negatives
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index a8f5b6532165..2c677e84c3f5 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -529,7 +529,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
 			pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
 				uname, &base, (unsigned long)(size / SZ_1M));
 			if (!nomap)
-				kmemleak_alloc_phys(base, size, 0, 0);
+				kmemleak_alloc_phys(base, size, 0);
 		}
 		else
 			pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 34684b2026ab..6a3cd1bf4680 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref;
 extern void kmemleak_ignore(const void *ptr) __ref;
 extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
 extern void kmemleak_no_scan(const void *ptr) __ref;
-extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
 				gfp_t gfp) __ref;
 extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
-extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref;
 extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
 
 static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
@@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr)
 {
 }
 static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
-				       int min_count, gfp_t gfp)
+				       gfp_t gfp)
 {
 }
 static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 {
 }
-static inline void kmemleak_not_leak_phys(phys_addr_t phys)
-{
-}
 static inline void kmemleak_ignore_phys(phys_addr_t phys)
 {
 }
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a182f5ddaf68..156eafafa182 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1125,15 +1125,13 @@ EXPORT_SYMBOL(kmemleak_no_scan);
  *			 address argument
  * @phys:	physical address of the object
  * @size:	size of the object
- * @min_count:	minimum number of references to this object.
- *              See kmemleak_alloc()
  * @gfp:	kmalloc() flags used for kmemleak internal memory allocations
  */
-void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
-			       gfp_t gfp)
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
 {
 	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-		kmemleak_alloc(__va(phys), size, min_count, gfp);
+		/* assume min_count 0 */
+		kmemleak_alloc(__va(phys), size, 0, gfp);
 }
 EXPORT_SYMBOL(kmemleak_alloc_phys);
 
@@ -1151,18 +1149,6 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 }
 EXPORT_SYMBOL(kmemleak_free_part_phys);
 
-/**
- * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
- *			    address argument
- * @phys:	physical address of the object
- */
-void __ref kmemleak_not_leak_phys(phys_addr_t phys)
-{
-	if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-		kmemleak_not_leak(__va(phys));
-}
-EXPORT_SYMBOL(kmemleak_not_leak_phys);
-
 /**
  * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
  *			  address argument
diff --git a/mm/memblock.c b/mm/memblock.c
index e4f03a6e8e56..749abd2685c4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1345,8 +1345,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
  * from the regions with mirroring enabled and then retried from any
  * memory region.
  *
- * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
- * allocated boot memory block, so that it is never reported as leaks.
+ * In addition, function using kmemleak_alloc_phys for allocated boot
+ * memory block, it is never reported as leaks.
  *
  * Return:
  * Physical address of allocated memory block on success, %0 on failure.
@@ -1398,12 +1398,12 @@ done:
 	 */
 	if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
 		/*
-		 * The min_count is set to 0 so that memblock allocated
-		 * blocks are never reported as leaks. This is because many
-		 * of these blocks are only referred via the physical
-		 * address which is not looked up by kmemleak.
+		 * Memblock allocated blocks are never reported as
+		 * leaks. This is because many of these blocks are
+		 * only referred via the physical address which is
+		 * not looked up by kmemleak.
 		 */
-		kmemleak_alloc_phys(found, size, 0, 0);
+		kmemleak_alloc_phys(found, size, 0);
 
 	return found;
 }
diff --git a/tools/testing/memblock/linux/kmemleak.h b/tools/testing/memblock/linux/kmemleak.h
index 462f8c5e8aa0..5fed13bb9ec4 100644
--- a/tools/testing/memblock/linux/kmemleak.h
+++ b/tools/testing/memblock/linux/kmemleak.h
@@ -7,7 +7,7 @@ static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
 }
 
 static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
-				       int min_count, gfp_t gfp)
+				       gfp_t gfp)
 {
 }
 
-- 
cgit v1.2.3


From fc4db90fe71e640e3fe88df346f7cf653b75315d Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Fri, 10 Jun 2022 11:03:10 -0700
Subject: mm: kmem: make mem_cgroup_from_obj() vmalloc()-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently mem_cgroup_from_obj() is not working properly with objects
allocated using vmalloc().  It creates problems in some cases, when it's
called for static objects belonging to modules or generally allocated
using vmalloc().

This patch makes mem_cgroup_from_obj() safe to be called on objects
allocated using vmalloc().

It also introduces mem_cgroup_from_slab_obj(), which is a faster version
to use in places when we know the object is either a slab object or a
generic slab page (e.g.  when adding an object to a lru list).

Link: https://lkml.kernel.org/r/20220610180310.1725111-1-roman.gushchin@linux.dev
Suggested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Acked-by: Shakeel Butt <shakeelb@google.com>
Tested-by: Vasily Averin <vvs@openvz.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Cc: Qian Cai <quic_qiancai@quicinc.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ++++
 mm/list_lru.c              |  2 +-
 mm/memcontrol.c            | 71 ++++++++++++++++++++++++++++++++--------------
 3 files changed, 57 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9ecead1042b9..3ce96ce5fe3e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1740,6 +1740,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 }
 
 struct mem_cgroup *mem_cgroup_from_obj(void *p);
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
 
 static inline void count_objcg_event(struct obj_cgroup *objcg,
 				     enum vm_event_item idx)
@@ -1801,6 +1802,11 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
        return NULL;
 }
 
+static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+	return NULL;
+}
+
 static inline void count_objcg_event(struct obj_cgroup *objcg,
 				     enum vm_event_item idx)
 {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index ba76428ceece..a05e5bef3b40 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -71,7 +71,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
 	if (!list_lru_memcg_aware(lru))
 		goto out;
 
-	memcg = mem_cgroup_from_obj(ptr);
+	memcg = mem_cgroup_from_slab_obj(ptr);
 	if (!memcg)
 		goto out;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28c1532cc91f..c1ae9b3f8d35 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -783,7 +783,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 	struct lruvec *lruvec;
 
 	rcu_read_lock();
-	memcg = mem_cgroup_from_obj(p);
+	memcg = mem_cgroup_from_slab_obj(p);
 
 	/*
 	 * Untracked pages have no memcg, no lruvec. Update only the
@@ -2841,27 +2841,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
 	return 0;
 }
 
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object or a generic kernel page, so
- * different mechanisms for getting the memory cgroup pointer should be used.
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
 {
-	struct folio *folio;
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	folio = virt_to_folio(p);
-
 	/*
 	 * Slab objects are accounted individually, not per-page.
 	 * Memcg membership data for each individual object is saved in
@@ -2894,6 +2876,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
 	return page_memcg_check(folio_page(folio, 0));
 }
 
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+	struct folio *folio;
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	if (unlikely(is_vmalloc_addr(p)))
+		folio = page_folio(vmalloc_to_page(p));
+	else
+		folio = virt_to_folio(p);
+
+	return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
+
 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 {
 	struct obj_cgroup *objcg = NULL;
-- 
cgit v1.2.3


From 1d0403d20f6c281cb3d14c5f1db5317caeec48e9 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@openvz.org>
Date: Fri, 3 Jun 2022 07:19:43 +0300
Subject: net: set proper memcg for net_init hooks allocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__register_pernet_operations() executes init hook of registered
pernet_operation structure in all existing net namespaces.

Typically, these hooks are called by a process associated with the
specified net namespace, and all __GFP_ACCOUNT marked allocation are
accounted for corresponding container/memcg.

However __register_pernet_operations() calls the hooks in the same
context, and as a result all marked allocations are accounted to one memcg
for all processed net namespaces.

This patch adjusts active memcg for each net namespace and helps to
account memory allocated inside ops_init() into the proper memcg.

Link: https://lkml.kernel.org/r/f9394752-e272-9bf9-645f-a18c56d1c4ec@openvz.org
Signed-off-by: Vasily Averin <vvs@openvz.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Linux Kernel Functional Testing <lkft@linaro.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naresh Kamboju <naresh.kamboju@linaro.org>
Cc: Qian Cai <quic_qiancai@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 net/core/net_namespace.c   |  7 +++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3ce96ce5fe3e..04f2f33607e9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1756,6 +1756,42 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }
 
+/**
+ * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object.
+ * @p: pointer to object from which memcg should be extracted. It can be NULL.
+ *
+ * Retrieves the memory group into which the memory of the pointed kernel
+ * object is accounted. If memcg is found, its reference is taken.
+ * If a passed kernel object is uncharged, or if proper memcg cannot be found,
+ * as well as if mem_cgroup is disabled, NULL is returned.
+ *
+ * Return: valid memcg pointer with taken reference or NULL.
+ */
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	do {
+		memcg = mem_cgroup_from_obj(p);
+	} while (memcg && !css_tryget(&memcg->css));
+	rcu_read_unlock();
+	return memcg;
+}
+
+/**
+ * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup.
+ * @memcg: pointer to a valid memory cgroup or NULL.
+ *
+ * If passed argument is not NULL, returns it without any additional checks
+ * and changes. Otherwise, root_mem_cgroup is returned.
+ *
+ * NOTE: root_mem_cgroup can be NULL during early boot.
+ */
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+	return memcg ? memcg : root_mem_cgroup;
+}
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1799,7 +1835,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
 
 static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
 {
-       return NULL;
+	return NULL;
 }
 
 static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
@@ -1812,6 +1848,15 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
 {
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+	return NULL;
+}
+
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0ec2f5906a27..6b9f19122ec1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -18,6 +18,7 @@
 #include <linux/user_namespace.h>
 #include <linux/net_namespace.h>
 #include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 #include <linux/uidgid.h>
 #include <linux/cookie.h>
 
@@ -1143,7 +1144,13 @@ static int __register_pernet_operations(struct list_head *list,
 		 * setup_net() and cleanup_net() are not possible.
 		 */
 		for_each_net(net) {
+			struct mem_cgroup *old, *memcg;
+
+			memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net));
+			old = set_active_memcg(memcg);
 			error = ops_init(ops, net);
+			set_active_memcg(old);
+			mem_cgroup_put(memcg);
 			if (error)
 				goto out_undo;
 			list_add_tail(&net->exit_list, &net_exit_list);
-- 
cgit v1.2.3


From 4815a36009044ba69a9b8d781943ec6505c451a2 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 3 Jun 2022 20:10:12 +0300
Subject: include/linux/rbtree.h: replace kernel.h with the necessary
 inclusions

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Link: https://lkml.kernel.org/r/20220603171012.48880-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rbtree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 235047d7a1b5..f7edca369eda 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -17,9 +17,9 @@
 #ifndef	_LINUX_RBTREE_H
 #define	_LINUX_RBTREE_H
 
+#include <linux/container_of.h>
 #include <linux/rbtree_types.h>
 
-#include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/rcupdate.h>
 
-- 
cgit v1.2.3


From dabba87229411a5e9d20ac03ffc36463c53ae672 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Fri, 27 May 2022 02:55:34 +0000
Subject: fs/kernel_read_file: allow to read files up-to ssize_t

Patch series "Allow to kexec with initramfs larger than 2G", v2.

Currently, the largest initramfs that is supported by kexec_file_load()
syscall is 2G.

This is because kernel_read_file() returns int, and is limited to INT_MAX
or 2G.

On the other hand, there are kexec based boot loaders (i.e.  u-root), that
may need to boot netboot images that might be larger than 2G.

The first patch changes the return type from int to ssize_t in
kernel_read_file* functions.

The second patch increases the maximum initramfs file size to 4G.

Tested: verified that can kexec_file_load() works with 4G initramfs
on x86_64.


This patch (of 2):

Currently, the maximum file size that is supported is 2G.  This may be too
small in some cases.  For example, kexec_file_load() system call loads
initramfs.  In some netboot cases initramfs can be rather large.

Allow to use up-to ssize_t bytes.  The callers still can limit the maximum
file size via buf_size.

Link: https://lkml.kernel.org/r/20220527025535.3953665-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20220527025535.3953665-2-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Sasha Levin <sashal@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/kernel_read_file.c            | 38 +++++++++++++++++++-------------------
 include/linux/kernel_read_file.h | 32 ++++++++++++++++----------------
 include/linux/limits.h           |  1 +
 3 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 1b07550485b9..5d826274570c 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -29,15 +29,15 @@
  * change between calls to kernel_read_file().
  *
  * Returns number of bytes read (no single read will be bigger
- * than INT_MAX), or negative on error.
+ * than SSIZE_MAX), or negative on error.
  *
  */
-int kernel_read_file(struct file *file, loff_t offset, void **buf,
-		     size_t buf_size, size_t *file_size,
-		     enum kernel_read_file_id id)
+ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf,
+			 size_t buf_size, size_t *file_size,
+			 enum kernel_read_file_id id)
 {
 	loff_t i_size, pos;
-	size_t copied;
+	ssize_t copied;
 	void *allocated = NULL;
 	bool whole_file;
 	int ret;
@@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
 		goto out;
 	}
 	/* The file is too big for sane activities. */
-	if (i_size > INT_MAX) {
+	if (i_size > SSIZE_MAX) {
 		ret = -EFBIG;
 		goto out;
 	}
@@ -124,12 +124,12 @@ out:
 }
 EXPORT_SYMBOL_GPL(kernel_read_file);
 
-int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
-			       size_t buf_size, size_t *file_size,
-			       enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+				   size_t buf_size, size_t *file_size,
+				   enum kernel_read_file_id id)
 {
 	struct file *file;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
 
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id)
 {
 	struct file *file;
 	struct path root;
-	int ret;
+	ssize_t ret;
 
 	if (!path || !*path)
 		return -EINVAL;
@@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
 }
 EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
 
-int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
-			     size_t buf_size, size_t *file_size,
-			     enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+				 size_t buf_size, size_t *file_size,
+				 enum kernel_read_file_id id)
 {
 	struct fd f = fdget(fd);
-	int ret = -EBADF;
+	ssize_t ret = -EBADF;
 
 	if (!f.file || !(f.file->f_mode & FMODE_READ))
 		goto out;
diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h
index 575ffa1031d3..90451e2e12bd 100644
--- a/include/linux/kernel_read_file.h
+++ b/include/linux/kernel_read_file.h
@@ -35,21 +35,21 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id)
 	return kernel_read_file_str[id];
 }
 
-int kernel_read_file(struct file *file, loff_t offset,
-		     void **buf, size_t buf_size,
-		     size_t *file_size,
-		     enum kernel_read_file_id id);
-int kernel_read_file_from_path(const char *path, loff_t offset,
-			       void **buf, size_t buf_size,
-			       size_t *file_size,
-			       enum kernel_read_file_id id);
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
-				      void **buf, size_t buf_size,
-				      size_t *file_size,
-				      enum kernel_read_file_id id);
-int kernel_read_file_from_fd(int fd, loff_t offset,
-			     void **buf, size_t buf_size,
-			     size_t *file_size,
-			     enum kernel_read_file_id id);
+ssize_t kernel_read_file(struct file *file, loff_t offset,
+			 void **buf, size_t buf_size,
+			 size_t *file_size,
+			 enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset,
+				   void **buf, size_t buf_size,
+				   size_t *file_size,
+				   enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+					  void **buf, size_t buf_size,
+					  size_t *file_size,
+					  enum kernel_read_file_id id);
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset,
+				 void **buf, size_t buf_size,
+				 size_t *file_size,
+				 enum kernel_read_file_id id);
 
 #endif /* _LINUX_KERNEL_READ_FILE_H */
diff --git a/include/linux/limits.h b/include/linux/limits.h
index b568b9c30bbf..f6bcc9369010 100644
--- a/include/linux/limits.h
+++ b/include/linux/limits.h
@@ -7,6 +7,7 @@
 #include <vdso/limits.h>
 
 #define SIZE_MAX	(~(size_t)0)
+#define SSIZE_MAX	((ssize_t)(SIZE_MAX >> 1))
 #define PHYS_ADDR_MAX	(~(phys_addr_t)0)
 
 #define U8_MAX		((u8)~0U)
-- 
cgit v1.2.3


From e244a46a529a9a4c43ae3a2b4bf613e260ec8f81 Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Tue, 14 Jun 2022 21:41:17 +0200
Subject: platform/surface: aggregator: Reserve more event- and
 target-categories

With the introduction of the Surface Laptop Studio, more event- and
target categories have been added. Therefore, increase the number of
reserved events and extend the enum of know target categories to
accommodate this.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220614194117.4118897-1-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/trace.h   | 80 +++++++++++++++------------
 include/linux/surface_aggregator/serial_hub.h | 75 +++++++++++++------------
 2 files changed, 85 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/trace.h b/drivers/platform/surface/aggregator/trace.h
index de64cf169060..cc9e73fbc18e 100644
--- a/drivers/platform/surface/aggregator/trace.h
+++ b/drivers/platform/surface/aggregator/trace.h
@@ -76,7 +76,7 @@ TRACE_DEFINE_ENUM(SSAM_SSH_TC_HID);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_TCH);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_BKL);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_TAM);
-TRACE_DEFINE_ENUM(SSAM_SSH_TC_ACC);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_ACC0);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_UFI);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_USC);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_PEN);
@@ -85,6 +85,11 @@ TRACE_DEFINE_ENUM(SSAM_SSH_TC_AUD);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_SMC);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_KPD);
 TRACE_DEFINE_ENUM(SSAM_SSH_TC_REG);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_SPT);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_SYS);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_ACC1);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_SHB);
+TRACE_DEFINE_ENUM(SSAM_SSH_TC_POS);
 
 #define SSAM_PTR_UID_LEN		9
 #define SSAM_U8_FIELD_NOT_APPLICABLE	((u16)-1)
@@ -229,40 +234,45 @@ static inline u32 ssam_trace_get_request_tc(const struct ssh_packet *p)
 
 #define ssam_show_ssh_tc(rqid)						\
 	__print_symbolic(rqid,						\
-		{ SSAM_SSH_TC_NOT_APPLICABLE,		"N/A" },	\
-		{ SSAM_SSH_TC_SAM,			"SAM" },	\
-		{ SSAM_SSH_TC_BAT,			"BAT" },	\
-		{ SSAM_SSH_TC_TMP,			"TMP" },	\
-		{ SSAM_SSH_TC_PMC,			"PMC" },	\
-		{ SSAM_SSH_TC_FAN,			"FAN" },	\
-		{ SSAM_SSH_TC_PoM,			"PoM" },	\
-		{ SSAM_SSH_TC_DBG,			"DBG" },	\
-		{ SSAM_SSH_TC_KBD,			"KBD" },	\
-		{ SSAM_SSH_TC_FWU,			"FWU" },	\
-		{ SSAM_SSH_TC_UNI,			"UNI" },	\
-		{ SSAM_SSH_TC_LPC,			"LPC" },	\
-		{ SSAM_SSH_TC_TCL,			"TCL" },	\
-		{ SSAM_SSH_TC_SFL,			"SFL" },	\
-		{ SSAM_SSH_TC_KIP,			"KIP" },	\
-		{ SSAM_SSH_TC_EXT,			"EXT" },	\
-		{ SSAM_SSH_TC_BLD,			"BLD" },	\
-		{ SSAM_SSH_TC_BAS,			"BAS" },	\
-		{ SSAM_SSH_TC_SEN,			"SEN" },	\
-		{ SSAM_SSH_TC_SRQ,			"SRQ" },	\
-		{ SSAM_SSH_TC_MCU,			"MCU" },	\
-		{ SSAM_SSH_TC_HID,			"HID" },	\
-		{ SSAM_SSH_TC_TCH,			"TCH" },	\
-		{ SSAM_SSH_TC_BKL,			"BKL" },	\
-		{ SSAM_SSH_TC_TAM,			"TAM" },	\
-		{ SSAM_SSH_TC_ACC,			"ACC" },	\
-		{ SSAM_SSH_TC_UFI,			"UFI" },	\
-		{ SSAM_SSH_TC_USC,			"USC" },	\
-		{ SSAM_SSH_TC_PEN,			"PEN" },	\
-		{ SSAM_SSH_TC_VID,			"VID" },	\
-		{ SSAM_SSH_TC_AUD,			"AUD" },	\
-		{ SSAM_SSH_TC_SMC,			"SMC" },	\
-		{ SSAM_SSH_TC_KPD,			"KPD" },	\
-		{ SSAM_SSH_TC_REG,			"REG" }		\
+		{ SSAM_SSH_TC_NOT_APPLICABLE,		"N/A"  },	\
+		{ SSAM_SSH_TC_SAM,			"SAM"  },	\
+		{ SSAM_SSH_TC_BAT,			"BAT"  },	\
+		{ SSAM_SSH_TC_TMP,			"TMP"  },	\
+		{ SSAM_SSH_TC_PMC,			"PMC"  },	\
+		{ SSAM_SSH_TC_FAN,			"FAN"  },	\
+		{ SSAM_SSH_TC_PoM,			"PoM"  },	\
+		{ SSAM_SSH_TC_DBG,			"DBG"  },	\
+		{ SSAM_SSH_TC_KBD,			"KBD"  },	\
+		{ SSAM_SSH_TC_FWU,			"FWU"  },	\
+		{ SSAM_SSH_TC_UNI,			"UNI"  },	\
+		{ SSAM_SSH_TC_LPC,			"LPC"  },	\
+		{ SSAM_SSH_TC_TCL,			"TCL"  },	\
+		{ SSAM_SSH_TC_SFL,			"SFL"  },	\
+		{ SSAM_SSH_TC_KIP,			"KIP"  },	\
+		{ SSAM_SSH_TC_EXT,			"EXT"  },	\
+		{ SSAM_SSH_TC_BLD,			"BLD"  },	\
+		{ SSAM_SSH_TC_BAS,			"BAS"  },	\
+		{ SSAM_SSH_TC_SEN,			"SEN"  },	\
+		{ SSAM_SSH_TC_SRQ,			"SRQ"  },	\
+		{ SSAM_SSH_TC_MCU,			"MCU"  },	\
+		{ SSAM_SSH_TC_HID,			"HID"  },	\
+		{ SSAM_SSH_TC_TCH,			"TCH"  },	\
+		{ SSAM_SSH_TC_BKL,			"BKL"  },	\
+		{ SSAM_SSH_TC_TAM,			"TAM"  },	\
+		{ SSAM_SSH_TC_ACC0,			"ACC0" },	\
+		{ SSAM_SSH_TC_UFI,			"UFI"  },	\
+		{ SSAM_SSH_TC_USC,			"USC"  },	\
+		{ SSAM_SSH_TC_PEN,			"PEN"  },	\
+		{ SSAM_SSH_TC_VID,			"VID"  },	\
+		{ SSAM_SSH_TC_AUD,			"AUD"  },	\
+		{ SSAM_SSH_TC_SMC,			"SMC"  },	\
+		{ SSAM_SSH_TC_KPD,			"KPD"  },	\
+		{ SSAM_SSH_TC_REG,			"REG"  },	\
+		{ SSAM_SSH_TC_SPT,			"SPT"  },	\
+		{ SSAM_SSH_TC_SYS,			"SYS"  },	\
+		{ SSAM_SSH_TC_ACC1,			"ACC1" },	\
+		{ SSAM_SSH_TC_SHB,			"SMB"  },	\
+		{ SSAM_SSH_TC_POS,			"POS"  }	\
 	)
 
 DECLARE_EVENT_CLASS(ssam_frame_class,
diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h
index 26b95ec12733..45501b6e54e8 100644
--- a/include/linux/surface_aggregator/serial_hub.h
+++ b/include/linux/surface_aggregator/serial_hub.h
@@ -201,7 +201,7 @@ static inline u16 ssh_crc(const u8 *buf, size_t len)
  * exception of zero, which is not an event ID. Thus, this is also the
  * absolute maximum number of event handlers that can be registered.
  */
-#define SSH_NUM_EVENTS		34
+#define SSH_NUM_EVENTS		38
 
 /*
  * SSH_NUM_TARGETS - The number of communication targets used in the protocol.
@@ -292,40 +292,45 @@ struct ssam_span {
  * Windows driver.
  */
 enum ssam_ssh_tc {
-				/* Category 0x00 is invalid for EC use. */
-	SSAM_SSH_TC_SAM = 0x01,	/* Generic system functionality, real-time clock. */
-	SSAM_SSH_TC_BAT = 0x02,	/* Battery/power subsystem. */
-	SSAM_SSH_TC_TMP = 0x03,	/* Thermal subsystem. */
-	SSAM_SSH_TC_PMC = 0x04,
-	SSAM_SSH_TC_FAN = 0x05,
-	SSAM_SSH_TC_PoM = 0x06,
-	SSAM_SSH_TC_DBG = 0x07,
-	SSAM_SSH_TC_KBD = 0x08,	/* Legacy keyboard (Laptop 1/2). */
-	SSAM_SSH_TC_FWU = 0x09,
-	SSAM_SSH_TC_UNI = 0x0a,
-	SSAM_SSH_TC_LPC = 0x0b,
-	SSAM_SSH_TC_TCL = 0x0c,
-	SSAM_SSH_TC_SFL = 0x0d,
-	SSAM_SSH_TC_KIP = 0x0e,	/* Manages detachable peripherals (Pro X/8 keyboard cover) */
-	SSAM_SSH_TC_EXT = 0x0f,
-	SSAM_SSH_TC_BLD = 0x10,
-	SSAM_SSH_TC_BAS = 0x11,	/* Detachment system (Surface Book 2/3). */
-	SSAM_SSH_TC_SEN = 0x12,
-	SSAM_SSH_TC_SRQ = 0x13,
-	SSAM_SSH_TC_MCU = 0x14,
-	SSAM_SSH_TC_HID = 0x15,	/* Generic HID input subsystem. */
-	SSAM_SSH_TC_TCH = 0x16,
-	SSAM_SSH_TC_BKL = 0x17,
-	SSAM_SSH_TC_TAM = 0x18,
-	SSAM_SSH_TC_ACC = 0x19,
-	SSAM_SSH_TC_UFI = 0x1a,
-	SSAM_SSH_TC_USC = 0x1b,
-	SSAM_SSH_TC_PEN = 0x1c,
-	SSAM_SSH_TC_VID = 0x1d,
-	SSAM_SSH_TC_AUD = 0x1e,
-	SSAM_SSH_TC_SMC = 0x1f,
-	SSAM_SSH_TC_KPD = 0x20,
-	SSAM_SSH_TC_REG = 0x21,	/* Extended event registry. */
+				  /* Category 0x00 is invalid for EC use. */
+	SSAM_SSH_TC_SAM  = 0x01,  /* Generic system functionality, real-time clock. */
+	SSAM_SSH_TC_BAT  = 0x02,  /* Battery/power subsystem. */
+	SSAM_SSH_TC_TMP  = 0x03,  /* Thermal subsystem. */
+	SSAM_SSH_TC_PMC  = 0x04,
+	SSAM_SSH_TC_FAN  = 0x05,
+	SSAM_SSH_TC_PoM  = 0x06,
+	SSAM_SSH_TC_DBG  = 0x07,
+	SSAM_SSH_TC_KBD  = 0x08,  /* Legacy keyboard (Laptop 1/2). */
+	SSAM_SSH_TC_FWU  = 0x09,
+	SSAM_SSH_TC_UNI  = 0x0a,
+	SSAM_SSH_TC_LPC  = 0x0b,
+	SSAM_SSH_TC_TCL  = 0x0c,
+	SSAM_SSH_TC_SFL  = 0x0d,
+	SSAM_SSH_TC_KIP  = 0x0e,  /* Manages detachable peripherals (Pro X/8 keyboard cover) */
+	SSAM_SSH_TC_EXT  = 0x0f,
+	SSAM_SSH_TC_BLD  = 0x10,
+	SSAM_SSH_TC_BAS  = 0x11,  /* Detachment system (Surface Book 2/3). */
+	SSAM_SSH_TC_SEN  = 0x12,
+	SSAM_SSH_TC_SRQ  = 0x13,
+	SSAM_SSH_TC_MCU  = 0x14,
+	SSAM_SSH_TC_HID  = 0x15,  /* Generic HID input subsystem. */
+	SSAM_SSH_TC_TCH  = 0x16,
+	SSAM_SSH_TC_BKL  = 0x17,
+	SSAM_SSH_TC_TAM  = 0x18,
+	SSAM_SSH_TC_ACC0 = 0x19,
+	SSAM_SSH_TC_UFI  = 0x1a,
+	SSAM_SSH_TC_USC  = 0x1b,
+	SSAM_SSH_TC_PEN  = 0x1c,
+	SSAM_SSH_TC_VID  = 0x1d,
+	SSAM_SSH_TC_AUD  = 0x1e,
+	SSAM_SSH_TC_SMC  = 0x1f,
+	SSAM_SSH_TC_KPD  = 0x20,
+	SSAM_SSH_TC_REG  = 0x21,  /* Extended event registry. */
+	SSAM_SSH_TC_SPT  = 0x22,
+	SSAM_SSH_TC_SYS  = 0x23,
+	SSAM_SSH_TC_ACC1 = 0x24,
+	SSAM_SSH_TC_SHB  = 0x25,
+	SSAM_SSH_TC_POS  = 0x26,  /* For obtaining Laptop Studio screen position. */
 };
 
 
-- 
cgit v1.2.3


From 1a3c7d0841ae24d28a15bbf87ee7d08614cec957 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Sat, 11 Jun 2022 01:25:11 -0700
Subject: swiotlb: remove the unused swiotlb_force declaration

The 'swiotlb_force' is removed since commit c6af2aa9ffc9 ("swiotlb: make
the swiotlb_init interface more useful").

Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 7ed35dd3de6e..bdc58a0e20b1 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -60,7 +60,6 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 
 #ifdef CONFIG_SWIOTLB
-extern enum swiotlb_force swiotlb_force;
 
 /**
  * struct io_tlb_mem - IO TLB Memory Pool Descriptor
-- 
cgit v1.2.3


From 62c0aff64c8d6594357b6217db019552ea664b90 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Jun 2022 20:11:47 +0300
Subject: clk: Remove never used devm_clk_*unregister()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the entire history of the devm_clk_*unregister() existence they were
used only once (*) in 2015. Remove them.

*) The commit 264e3b75de4e ("clk: s2mps11: Simplify s2mps11_clk_probe unwind
   paths") exactly supports the point of the change proposed here.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20220622171147.85603-1-andriy.shevchenko@linux.intel.com
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.c            | 48 --------------------------------------------
 include/linux/clk-provider.h |  2 --
 2 files changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index f00d4c1158d7..7fc191c15507 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -4279,54 +4279,6 @@ int devm_clk_hw_register(struct device *dev, struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(devm_clk_hw_register);
 
-static int devm_clk_match(struct device *dev, void *res, void *data)
-{
-	struct clk *c = res;
-	if (WARN_ON(!c))
-		return 0;
-	return c == data;
-}
-
-static int devm_clk_hw_match(struct device *dev, void *res, void *data)
-{
-	struct clk_hw *hw = res;
-
-	if (WARN_ON(!hw))
-		return 0;
-	return hw == data;
-}
-
-/**
- * devm_clk_unregister - resource managed clk_unregister()
- * @dev: device that is unregistering the clock data
- * @clk: clock to unregister
- *
- * Deallocate a clock allocated with devm_clk_register(). Normally
- * this function will not need to be called and the resource management
- * code will ensure that the resource is freed.
- */
-void devm_clk_unregister(struct device *dev, struct clk *clk)
-{
-	WARN_ON(devres_release(dev, devm_clk_unregister_cb, devm_clk_match, clk));
-}
-EXPORT_SYMBOL_GPL(devm_clk_unregister);
-
-/**
- * devm_clk_hw_unregister - resource managed clk_hw_unregister()
- * @dev: device that is unregistering the hardware-specific clock data
- * @hw: link to hardware-specific clock data
- *
- * Unregister a clk_hw registered with devm_clk_hw_register(). Normally
- * this function will not need to be called and the resource management
- * code will ensure that the resource is freed.
- */
-void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw)
-{
-	WARN_ON(devres_release(dev, devm_clk_hw_unregister_cb, devm_clk_hw_match,
-				hw));
-}
-EXPORT_SYMBOL_GPL(devm_clk_hw_unregister);
-
 static void devm_clk_release(struct device *dev, void *res)
 {
 	clk_put(*(struct clk **)res);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index c10dc4c659e2..72d937c03a3e 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1176,10 +1176,8 @@ int __must_check devm_clk_hw_register(struct device *dev, struct clk_hw *hw);
 int __must_check of_clk_hw_register(struct device_node *node, struct clk_hw *hw);
 
 void clk_unregister(struct clk *clk);
-void devm_clk_unregister(struct device *dev, struct clk *clk);
 
 void clk_hw_unregister(struct clk_hw *hw);
-void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw);
 
 /* helper functions */
 const char *__clk_get_name(const struct clk *clk);
-- 
cgit v1.2.3


From 203184571388a988283543f0fd7da1a0da7c3f91 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:53 -0500
Subject: dmaengine: dw-edma: Detach the private data and chip info structures

"struct dw_edma_chip" contains an internal structure "struct dw_edma" that
is used by the eDMA core internally and should not be touched by the eDMA
controller drivers themselves. But currently, the eDMA controller drivers
like "dw-edma-pci" allocate and populate this internal structure before
passing it on to the eDMA core. The eDMA core further populates the
structure and uses it. This is wrong!

Hence, move all the "struct dw_edma" specifics from controller drivers to
the eDMA core.

Link: https://lore.kernel.org/r/20220524152159.2370739-3-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-core.c       | 90 +++++++++++++++++---------------
 drivers/dma/dw-edma/dw-edma-core.h       | 31 ++---------
 drivers/dma/dw-edma/dw-edma-pcie.c       | 82 ++++++++++++-----------------
 drivers/dma/dw-edma/dw-edma-v0-core.c    | 32 ++++++------
 drivers/dma/dw-edma/dw-edma-v0-core.h    |  4 +-
 drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 18 +++----
 drivers/dma/dw-edma/dw-edma-v0-debugfs.h |  8 +--
 include/linux/dma/edma.h                 | 48 ++++++++++++++++-
 8 files changed, 164 insertions(+), 149 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 468d1097a1ec..9a4c96f7d9d9 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -64,8 +64,8 @@ static struct dw_edma_burst *dw_edma_alloc_burst(struct dw_edma_chunk *chunk)
 
 static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
 {
+	struct dw_edma_chip *chip = desc->chan->dw->chip;
 	struct dw_edma_chan *chan = desc->chan;
-	struct dw_edma *dw = chan->chip->dw;
 	struct dw_edma_chunk *chunk;
 
 	chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
@@ -82,11 +82,11 @@ static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc)
 	 */
 	chunk->cb = !(desc->chunks_alloc % 2);
 	if (chan->dir == EDMA_DIR_WRITE) {
-		chunk->ll_region.paddr = dw->ll_region_wr[chan->id].paddr;
-		chunk->ll_region.vaddr = dw->ll_region_wr[chan->id].vaddr;
+		chunk->ll_region.paddr = chip->ll_region_wr[chan->id].paddr;
+		chunk->ll_region.vaddr = chip->ll_region_wr[chan->id].vaddr;
 	} else {
-		chunk->ll_region.paddr = dw->ll_region_rd[chan->id].paddr;
-		chunk->ll_region.vaddr = dw->ll_region_rd[chan->id].vaddr;
+		chunk->ll_region.paddr = chip->ll_region_rd[chan->id].paddr;
+		chunk->ll_region.vaddr = chip->ll_region_rd[chan->id].vaddr;
 	}
 
 	if (desc->chunk) {
@@ -663,7 +663,7 @@ static int dw_edma_alloc_chan_resources(struct dma_chan *dchan)
 	if (chan->status != EDMA_ST_IDLE)
 		return -EBUSY;
 
-	pm_runtime_get(chan->chip->dev);
+	pm_runtime_get(chan->dw->chip->dev);
 
 	return 0;
 }
@@ -685,15 +685,15 @@ static void dw_edma_free_chan_resources(struct dma_chan *dchan)
 		cpu_relax();
 	}
 
-	pm_runtime_put(chan->chip->dev);
+	pm_runtime_put(chan->dw->chip->dev);
 }
 
-static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
+static int dw_edma_channel_setup(struct dw_edma *dw, bool write,
 				 u32 wr_alloc, u32 rd_alloc)
 {
+	struct dw_edma_chip *chip = dw->chip;
 	struct dw_edma_region *dt_region;
 	struct device *dev = chip->dev;
-	struct dw_edma *dw = chip->dw;
 	struct dw_edma_chan *chan;
 	struct dw_edma_irq *irq;
 	struct dma_device *dma;
@@ -726,7 +726,7 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
 
 		chan->vc.chan.private = dt_region;
 
-		chan->chip = chip;
+		chan->dw = dw;
 		chan->id = j;
 		chan->dir = write ? EDMA_DIR_WRITE : EDMA_DIR_READ;
 		chan->configured = false;
@@ -734,9 +734,9 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
 		chan->status = EDMA_ST_IDLE;
 
 		if (write)
-			chan->ll_max = (dw->ll_region_wr[j].sz / EDMA_LL_SZ);
+			chan->ll_max = (chip->ll_region_wr[j].sz / EDMA_LL_SZ);
 		else
-			chan->ll_max = (dw->ll_region_rd[j].sz / EDMA_LL_SZ);
+			chan->ll_max = (chip->ll_region_rd[j].sz / EDMA_LL_SZ);
 		chan->ll_max -= 1;
 
 		dev_vdbg(dev, "L. List:\tChannel %s[%u] max_cnt=%u\n",
@@ -766,13 +766,13 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write,
 		vchan_init(&chan->vc, dma);
 
 		if (write) {
-			dt_region->paddr = dw->dt_region_wr[j].paddr;
-			dt_region->vaddr = dw->dt_region_wr[j].vaddr;
-			dt_region->sz = dw->dt_region_wr[j].sz;
+			dt_region->paddr = chip->dt_region_wr[j].paddr;
+			dt_region->vaddr = chip->dt_region_wr[j].vaddr;
+			dt_region->sz = chip->dt_region_wr[j].sz;
 		} else {
-			dt_region->paddr = dw->dt_region_rd[j].paddr;
-			dt_region->vaddr = dw->dt_region_rd[j].vaddr;
-			dt_region->sz = dw->dt_region_rd[j].sz;
+			dt_region->paddr = chip->dt_region_rd[j].paddr;
+			dt_region->vaddr = chip->dt_region_rd[j].vaddr;
+			dt_region->sz = chip->dt_region_rd[j].sz;
 		}
 
 		dw_edma_v0_core_device_config(chan);
@@ -826,11 +826,11 @@ static inline void dw_edma_add_irq_mask(u32 *mask, u32 alloc, u16 cnt)
 		(*mask)++;
 }
 
-static int dw_edma_irq_request(struct dw_edma_chip *chip,
+static int dw_edma_irq_request(struct dw_edma *dw,
 			       u32 *wr_alloc, u32 *rd_alloc)
 {
-	struct device *dev = chip->dev;
-	struct dw_edma *dw = chip->dw;
+	struct dw_edma_chip *chip = dw->chip;
+	struct device *dev = dw->chip->dev;
 	u32 wr_mask = 1;
 	u32 rd_mask = 1;
 	int i, err = 0;
@@ -839,12 +839,16 @@ static int dw_edma_irq_request(struct dw_edma_chip *chip,
 
 	ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt;
 
-	if (dw->nr_irqs < 1)
+	if (chip->nr_irqs < 1 || !chip->ops->irq_vector)
 		return -EINVAL;
 
-	if (dw->nr_irqs == 1) {
+	dw->irq = devm_kcalloc(dev, chip->nr_irqs, sizeof(*dw->irq), GFP_KERNEL);
+	if (!dw->irq)
+		return -ENOMEM;
+
+	if (chip->nr_irqs == 1) {
 		/* Common IRQ shared among all channels */
-		irq = dw->ops->irq_vector(dev, 0);
+		irq = chip->ops->irq_vector(dev, 0);
 		err = request_irq(irq, dw_edma_interrupt_common,
 				  IRQF_SHARED, dw->name, &dw->irq[0]);
 		if (err) {
@@ -854,9 +858,11 @@ static int dw_edma_irq_request(struct dw_edma_chip *chip,
 
 		if (irq_get_msi_desc(irq))
 			get_cached_msi_msg(irq, &dw->irq[0].msi);
+
+		dw->nr_irqs = 1;
 	} else {
 		/* Distribute IRQs equally among all channels */
-		int tmp = dw->nr_irqs;
+		int tmp = chip->nr_irqs;
 
 		while (tmp && (*wr_alloc + *rd_alloc) < ch_cnt) {
 			dw_edma_dec_irq_alloc(&tmp, wr_alloc, dw->wr_ch_cnt);
@@ -867,7 +873,7 @@ static int dw_edma_irq_request(struct dw_edma_chip *chip,
 		dw_edma_add_irq_mask(&rd_mask, *rd_alloc, dw->rd_ch_cnt);
 
 		for (i = 0; i < (*wr_alloc + *rd_alloc); i++) {
-			irq = dw->ops->irq_vector(dev, i);
+			irq = chip->ops->irq_vector(dev, i);
 			err = request_irq(irq,
 					  i < *wr_alloc ?
 						dw_edma_interrupt_write :
@@ -901,20 +907,22 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 		return -EINVAL;
 
 	dev = chip->dev;
-	if (!dev)
+	if (!dev || !chip->ops)
 		return -EINVAL;
 
-	dw = chip->dw;
-	if (!dw || !dw->irq || !dw->ops || !dw->ops->irq_vector)
-		return -EINVAL;
+	dw = devm_kzalloc(dev, sizeof(*dw), GFP_KERNEL);
+	if (!dw)
+		return -ENOMEM;
+
+	dw->chip = chip;
 
 	raw_spin_lock_init(&dw->lock);
 
-	dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt,
+	dw->wr_ch_cnt = min_t(u16, chip->wr_ch_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE));
 	dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, EDMA_MAX_WR_CH);
 
-	dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt,
+	dw->rd_ch_cnt = min_t(u16, chip->rd_ch_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ));
 	dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, EDMA_MAX_RD_CH);
 
@@ -936,17 +944,17 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 	dw_edma_v0_core_off(dw);
 
 	/* Request IRQs */
-	err = dw_edma_irq_request(chip, &wr_alloc, &rd_alloc);
+	err = dw_edma_irq_request(dw, &wr_alloc, &rd_alloc);
 	if (err)
 		return err;
 
 	/* Setup write channels */
-	err = dw_edma_channel_setup(chip, true, wr_alloc, rd_alloc);
+	err = dw_edma_channel_setup(dw, true, wr_alloc, rd_alloc);
 	if (err)
 		goto err_irq_free;
 
 	/* Setup read channels */
-	err = dw_edma_channel_setup(chip, false, wr_alloc, rd_alloc);
+	err = dw_edma_channel_setup(dw, false, wr_alloc, rd_alloc);
 	if (err)
 		goto err_irq_free;
 
@@ -954,15 +962,15 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 	pm_runtime_enable(dev);
 
 	/* Turn debugfs on */
-	dw_edma_v0_core_debugfs_on(chip);
+	dw_edma_v0_core_debugfs_on(dw);
+
+	chip->dw = dw;
 
 	return 0;
 
 err_irq_free:
 	for (i = (dw->nr_irqs - 1); i >= 0; i--)
-		free_irq(dw->ops->irq_vector(dev, i), &dw->irq[i]);
-
-	dw->nr_irqs = 0;
+		free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]);
 
 	return err;
 }
@@ -980,7 +988,7 @@ int dw_edma_remove(struct dw_edma_chip *chip)
 
 	/* Free irqs */
 	for (i = (dw->nr_irqs - 1); i >= 0; i--)
-		free_irq(dw->ops->irq_vector(dev, i), &dw->irq[i]);
+		free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]);
 
 	/* Power management */
 	pm_runtime_disable(dev);
@@ -1001,7 +1009,7 @@ int dw_edma_remove(struct dw_edma_chip *chip)
 	}
 
 	/* Turn debugfs off */
-	dw_edma_v0_core_debugfs_off(chip);
+	dw_edma_v0_core_debugfs_off(dw);
 
 	return 0;
 }
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index 60316d408c3e..85df2d511907 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -15,20 +15,12 @@
 #include "../virt-dma.h"
 
 #define EDMA_LL_SZ					24
-#define EDMA_MAX_WR_CH					8
-#define EDMA_MAX_RD_CH					8
 
 enum dw_edma_dir {
 	EDMA_DIR_WRITE = 0,
 	EDMA_DIR_READ
 };
 
-enum dw_edma_map_format {
-	EDMA_MF_EDMA_LEGACY = 0x0,
-	EDMA_MF_EDMA_UNROLL = 0x1,
-	EDMA_MF_HDMA_COMPAT = 0x5
-};
-
 enum dw_edma_request {
 	EDMA_REQ_NONE = 0,
 	EDMA_REQ_STOP,
@@ -57,12 +49,6 @@ struct dw_edma_burst {
 	u32				sz;
 };
 
-struct dw_edma_region {
-	phys_addr_t			paddr;
-	void				__iomem *vaddr;
-	size_t				sz;
-};
-
 struct dw_edma_chunk {
 	struct list_head		list;
 	struct dw_edma_chan		*chan;
@@ -87,7 +73,7 @@ struct dw_edma_desc {
 
 struct dw_edma_chan {
 	struct virt_dma_chan		vc;
-	struct dw_edma_chip		*chip;
+	struct dw_edma			*dw;
 	int				id;
 	enum dw_edma_dir		dir;
 
@@ -109,10 +95,6 @@ struct dw_edma_irq {
 	struct dw_edma			*dw;
 };
 
-struct dw_edma_core_ops {
-	int	(*irq_vector)(struct device *dev, unsigned int nr);
-};
-
 struct dw_edma {
 	char				name[20];
 
@@ -122,21 +104,14 @@ struct dw_edma {
 	struct dma_device		rd_edma;
 	u16				rd_ch_cnt;
 
-	struct dw_edma_region		rg_region;	/* Registers */
-	struct dw_edma_region		ll_region_wr[EDMA_MAX_WR_CH];
-	struct dw_edma_region		ll_region_rd[EDMA_MAX_RD_CH];
-	struct dw_edma_region		dt_region_wr[EDMA_MAX_WR_CH];
-	struct dw_edma_region		dt_region_rd[EDMA_MAX_RD_CH];
-
 	struct dw_edma_irq		*irq;
 	int				nr_irqs;
 
-	enum dw_edma_map_format		mf;
-
 	struct dw_edma_chan		*chan;
-	const struct dw_edma_core_ops	*ops;
 
 	raw_spinlock_t			lock;		/* Only for legacy */
+
+	struct dw_edma_chip             *chip;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry			*debugfs;
 #endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index bc07923c3bc0..965513698f3d 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -148,7 +148,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	struct dw_edma_pcie_data vsec_data;
 	struct device *dev = &pdev->dev;
 	struct dw_edma_chip *chip;
-	struct dw_edma *dw;
 	int err, nr_irqs;
 	int i, mask;
 
@@ -197,10 +196,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	if (!chip)
 		return -ENOMEM;
 
-	dw = devm_kzalloc(dev, sizeof(*dw), GFP_KERNEL);
-	if (!dw)
-		return -ENOMEM;
-
 	/* IRQs allocation */
 	nr_irqs = pci_alloc_irq_vectors(pdev, 1, vsec_data.irqs,
 					PCI_IRQ_MSI | PCI_IRQ_MSIX);
@@ -211,28 +206,23 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	}
 
 	/* Data structure initialization */
-	chip->dw = dw;
 	chip->dev = dev;
 	chip->id = pdev->devfn;
 
-	dw->mf = vsec_data.mf;
-	dw->nr_irqs = nr_irqs;
-	dw->ops = &dw_edma_pcie_core_ops;
-	dw->wr_ch_cnt = vsec_data.wr_ch_cnt;
-	dw->rd_ch_cnt = vsec_data.rd_ch_cnt;
+	chip->mf = vsec_data.mf;
+	chip->nr_irqs = nr_irqs;
+	chip->ops = &dw_edma_pcie_core_ops;
 
-	dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
-	if (!dw->rg_region.vaddr)
-		return -ENOMEM;
+	chip->wr_ch_cnt = vsec_data.wr_ch_cnt;
+	chip->rd_ch_cnt = vsec_data.rd_ch_cnt;
 
-	dw->rg_region.vaddr += vsec_data.rg.off;
-	dw->rg_region.paddr = pdev->resource[vsec_data.rg.bar].start;
-	dw->rg_region.paddr += vsec_data.rg.off;
-	dw->rg_region.sz = vsec_data.rg.sz;
+	chip->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
+	if (!chip->rg_region.vaddr)
+		return -ENOMEM;
 
-	for (i = 0; i < dw->wr_ch_cnt; i++) {
-		struct dw_edma_region *ll_region = &dw->ll_region_wr[i];
-		struct dw_edma_region *dt_region = &dw->dt_region_wr[i];
+	for (i = 0; i < chip->wr_ch_cnt; i++) {
+		struct dw_edma_region *ll_region = &chip->ll_region_wr[i];
+		struct dw_edma_region *dt_region = &chip->dt_region_wr[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_wr[i];
 		struct dw_edma_block *dt_block = &vsec_data.dt_wr[i];
 
@@ -255,9 +245,9 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		dt_region->sz = dt_block->sz;
 	}
 
-	for (i = 0; i < dw->rd_ch_cnt; i++) {
-		struct dw_edma_region *ll_region = &dw->ll_region_rd[i];
-		struct dw_edma_region *dt_region = &dw->dt_region_rd[i];
+	for (i = 0; i < chip->rd_ch_cnt; i++) {
+		struct dw_edma_region *ll_region = &chip->ll_region_rd[i];
+		struct dw_edma_region *dt_region = &chip->dt_region_rd[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_rd[i];
 		struct dw_edma_block *dt_block = &vsec_data.dt_rd[i];
 
@@ -281,45 +271,45 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	}
 
 	/* Debug info */
-	if (dw->mf == EDMA_MF_EDMA_LEGACY)
-		pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", dw->mf);
-	else if (dw->mf == EDMA_MF_EDMA_UNROLL)
-		pci_dbg(pdev, "Version:\teDMA Unroll (0x%x)\n", dw->mf);
-	else if (dw->mf == EDMA_MF_HDMA_COMPAT)
-		pci_dbg(pdev, "Version:\tHDMA Compatible (0x%x)\n", dw->mf);
+	if (chip->mf == EDMA_MF_EDMA_LEGACY)
+		pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", chip->mf);
+	else if (chip->mf == EDMA_MF_EDMA_UNROLL)
+		pci_dbg(pdev, "Version:\teDMA Unroll (0x%x)\n", chip->mf);
+	else if (chip->mf == EDMA_MF_HDMA_COMPAT)
+		pci_dbg(pdev, "Version:\tHDMA Compatible (0x%x)\n", chip->mf);
 	else
-		pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", dw->mf);
+		pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", chip->mf);
 
-	pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
+	pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p)\n",
 		vsec_data.rg.bar, vsec_data.rg.off, vsec_data.rg.sz,
-		dw->rg_region.vaddr, &dw->rg_region.paddr);
+		chip->rg_region.vaddr);
 
 
-	for (i = 0; i < dw->wr_ch_cnt; i++) {
+	for (i = 0; i < chip->wr_ch_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_wr[i].bar,
-			vsec_data.ll_wr[i].off, dw->ll_region_wr[i].sz,
-			dw->ll_region_wr[i].vaddr, &dw->ll_region_wr[i].paddr);
+			vsec_data.ll_wr[i].off, chip->ll_region_wr[i].sz,
+			chip->ll_region_wr[i].vaddr, &chip->ll_region_wr[i].paddr);
 
 		pci_dbg(pdev, "Data:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.dt_wr[i].bar,
-			vsec_data.dt_wr[i].off, dw->dt_region_wr[i].sz,
-			dw->dt_region_wr[i].vaddr, &dw->dt_region_wr[i].paddr);
+			vsec_data.dt_wr[i].off, chip->dt_region_wr[i].sz,
+			chip->dt_region_wr[i].vaddr, &chip->dt_region_wr[i].paddr);
 	}
 
-	for (i = 0; i < dw->rd_ch_cnt; i++) {
+	for (i = 0; i < chip->rd_ch_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_rd[i].bar,
-			vsec_data.ll_rd[i].off, dw->ll_region_rd[i].sz,
-			dw->ll_region_rd[i].vaddr, &dw->ll_region_rd[i].paddr);
+			vsec_data.ll_rd[i].off, chip->ll_region_rd[i].sz,
+			chip->ll_region_rd[i].vaddr, &chip->ll_region_rd[i].paddr);
 
 		pci_dbg(pdev, "Data:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.dt_rd[i].bar,
-			vsec_data.dt_rd[i].off, dw->dt_region_rd[i].sz,
-			dw->dt_region_rd[i].vaddr, &dw->dt_region_rd[i].paddr);
+			vsec_data.dt_rd[i].off, chip->dt_region_rd[i].sz,
+			chip->dt_region_rd[i].vaddr, &chip->dt_region_rd[i].paddr);
 	}
 
-	pci_dbg(pdev, "Nr. IRQs:\t%u\n", dw->nr_irqs);
+	pci_dbg(pdev, "Nr. IRQs:\t%u\n", chip->nr_irqs);
 
 	/* Validating if PCI interrupts were enabled */
 	if (!pci_dev_msi_enabled(pdev)) {
@@ -327,10 +317,6 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		return -EPERM;
 	}
 
-	dw->irq = devm_kcalloc(dev, nr_irqs, sizeof(*dw->irq), GFP_KERNEL);
-	if (!dw->irq)
-		return -ENOMEM;
-
 	/* Starting eDMA driver */
 	err = dw_edma_probe(chip);
 	if (err) {
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 33bc1e6c4cf2..999e03896186 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -25,7 +25,7 @@ enum dw_edma_control {
 
 static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
 {
-	return dw->rg_region.vaddr;
+	return dw->chip->rg_region.vaddr;
 }
 
 #define SET_32(dw, name, value)				\
@@ -96,7 +96,7 @@ static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
 static inline struct dw_edma_v0_ch_regs __iomem *
 __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch)
 {
-	if (dw->mf == EDMA_MF_EDMA_LEGACY)
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY)
 		return &(__dw_regs(dw)->type.legacy.ch);
 
 	if (dir == EDMA_DIR_WRITE)
@@ -108,7 +108,7 @@ __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch)
 static inline void writel_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 			     u32 value, void __iomem *addr)
 {
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -133,7 +133,7 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 {
 	u32 value;
 
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -169,7 +169,7 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 static inline void writeq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 			     u64 value, void __iomem *addr)
 {
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -194,7 +194,7 @@ static inline u64 readq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch,
 {
 	u32 value;
 
-	if (dw->mf == EDMA_MF_EDMA_LEGACY) {
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY) {
 		u32 viewport_sel;
 		unsigned long flags;
 
@@ -256,7 +256,7 @@ u16 dw_edma_v0_core_ch_count(struct dw_edma *dw, enum dw_edma_dir dir)
 
 enum dma_status dw_edma_v0_core_ch_status(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 	u32 tmp;
 
 	tmp = FIELD_GET(EDMA_V0_CH_STATUS_MASK,
@@ -272,7 +272,7 @@ enum dma_status dw_edma_v0_core_ch_status(struct dw_edma_chan *chan)
 
 void dw_edma_v0_core_clear_done_int(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 
 	SET_RW_32(dw, chan->dir, int_clear,
 		  FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id)));
@@ -280,7 +280,7 @@ void dw_edma_v0_core_clear_done_int(struct dw_edma_chan *chan)
 
 void dw_edma_v0_core_clear_abort_int(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 
 	SET_RW_32(dw, chan->dir, int_clear,
 		  FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id)));
@@ -357,7 +357,7 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
 void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 {
 	struct dw_edma_chan *chan = chunk->chan;
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 	u32 tmp;
 
 	dw_edma_v0_core_write_chunk(chunk);
@@ -365,7 +365,7 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 	if (first) {
 		/* Enable engine */
 		SET_RW_32(dw, chan->dir, engine_en, BIT(0));
-		if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+		if (dw->chip->mf == EDMA_MF_HDMA_COMPAT) {
 			switch (chan->id) {
 			case 0:
 				SET_RW_COMPAT(dw, chan->dir, ch0_pwr_en,
@@ -435,7 +435,7 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first)
 
 int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
 {
-	struct dw_edma *dw = chan->chip->dw;
+	struct dw_edma *dw = chan->dw;
 	u32 tmp = 0;
 
 	/* MSI done addr - low, high */
@@ -505,12 +505,12 @@ int dw_edma_v0_core_device_config(struct dw_edma_chan *chan)
 }
 
 /* eDMA debugfs callbacks */
-void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip)
+void dw_edma_v0_core_debugfs_on(struct dw_edma *dw)
 {
-	dw_edma_v0_debugfs_on(chip);
+	dw_edma_v0_debugfs_on(dw);
 }
 
-void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip)
+void dw_edma_v0_core_debugfs_off(struct dw_edma *dw)
 {
-	dw_edma_v0_debugfs_off(chip);
+	dw_edma_v0_debugfs_off(dw);
 }
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.h b/drivers/dma/dw-edma/dw-edma-v0-core.h
index 2afa626b8300..75aec6d31b21 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.h
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.h
@@ -22,7 +22,7 @@ u32 dw_edma_v0_core_status_abort_int(struct dw_edma *chan, enum dw_edma_dir dir)
 void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first);
 int dw_edma_v0_core_device_config(struct dw_edma_chan *chan);
 /* eDMA debug fs callbacks */
-void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip);
-void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip);
+void dw_edma_v0_core_debugfs_on(struct dw_edma *dw);
+void dw_edma_v0_core_debugfs_off(struct dw_edma *dw);
 
 #endif /* _DW_EDMA_V0_CORE_H */
diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
index 4b3bcffd15ef..b765adb96999 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
@@ -54,7 +54,7 @@ struct debugfs_entries {
 static int dw_edma_debugfs_u32_get(void *data, u64 *val)
 {
 	void __iomem *reg = (void __force __iomem *)data;
-	if (dw->mf == EDMA_MF_EDMA_LEGACY &&
+	if (dw->chip->mf == EDMA_MF_EDMA_LEGACY &&
 	    reg >= (void __iomem *)&regs->type.legacy.ch) {
 		void __iomem *ptr = &regs->type.legacy.ch;
 		u32 viewport_sel = 0;
@@ -173,7 +173,7 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir)
 	nr_entries = ARRAY_SIZE(debugfs_regs);
 	dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir);
 
-	if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+	if (dw->chip->mf == EDMA_MF_HDMA_COMPAT) {
 		nr_entries = ARRAY_SIZE(debugfs_unroll_regs);
 		dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries,
 					   regs_dir);
@@ -242,7 +242,7 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir)
 	nr_entries = ARRAY_SIZE(debugfs_regs);
 	dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir);
 
-	if (dw->mf == EDMA_MF_HDMA_COMPAT) {
+	if (dw->chip->mf == EDMA_MF_HDMA_COMPAT) {
 		nr_entries = ARRAY_SIZE(debugfs_unroll_regs);
 		dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries,
 					   regs_dir);
@@ -282,13 +282,13 @@ static void dw_edma_debugfs_regs(void)
 	dw_edma_debugfs_regs_rd(regs_dir);
 }
 
-void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
+void dw_edma_v0_debugfs_on(struct dw_edma *_dw)
 {
-	dw = chip->dw;
+	dw = _dw;
 	if (!dw)
 		return;
 
-	regs = dw->rg_region.vaddr;
+	regs = dw->chip->rg_region.vaddr;
 	if (!regs)
 		return;
 
@@ -296,16 +296,16 @@ void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
 	if (!dw->debugfs)
 		return;
 
-	debugfs_create_u32("mf", 0444, dw->debugfs, &dw->mf);
+	debugfs_create_u32("mf", 0444, dw->debugfs, &dw->chip->mf);
 	debugfs_create_u16("wr_ch_cnt", 0444, dw->debugfs, &dw->wr_ch_cnt);
 	debugfs_create_u16("rd_ch_cnt", 0444, dw->debugfs, &dw->rd_ch_cnt);
 
 	dw_edma_debugfs_regs();
 }
 
-void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip)
+void dw_edma_v0_debugfs_off(struct dw_edma *_dw)
 {
-	dw = chip->dw;
+	dw = _dw;
 	if (!dw)
 		return;
 
diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.h b/drivers/dma/dw-edma/dw-edma-v0-debugfs.h
index d0ff25a9ea5c..3391b86edf5a 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.h
+++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.h
@@ -12,14 +12,14 @@
 #include <linux/dma/edma.h>
 
 #ifdef CONFIG_DEBUG_FS
-void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip);
-void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip);
+void dw_edma_v0_debugfs_on(struct dw_edma *dw);
+void dw_edma_v0_debugfs_off(struct dw_edma *dw);
 #else
-static inline void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip)
+static inline void dw_edma_v0_debugfs_on(struct dw_edma *dw)
 {
 }
 
-static inline void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip)
+static inline void dw_edma_v0_debugfs_off(struct dw_edma *dw)
 {
 }
 #endif /* CONFIG_DEBUG_FS */
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index d4333e721588..6f64e90d5c38 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -12,17 +12,63 @@
 #include <linux/device.h>
 #include <linux/dmaengine.h>
 
+#define EDMA_MAX_WR_CH                                  8
+#define EDMA_MAX_RD_CH                                  8
+
 struct dw_edma;
 
+struct dw_edma_region {
+	phys_addr_t	paddr;
+	void __iomem	*vaddr;
+	size_t		sz;
+};
+
+struct dw_edma_core_ops {
+	int (*irq_vector)(struct device *dev, unsigned int nr);
+};
+
+enum dw_edma_map_format {
+	EDMA_MF_EDMA_LEGACY = 0x0,
+	EDMA_MF_EDMA_UNROLL = 0x1,
+	EDMA_MF_HDMA_COMPAT = 0x5
+};
+
 /**
  * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
  * @dev:		 struct device of the eDMA controller
  * @id:			 instance ID
- * @dw:			 struct dw_edma that is filed by dw_edma_probe()
+ * @nr_irqs:		 total number of DMA IRQs
+ * @ops			 DMA channel to IRQ number mapping
+ * @wr_ch_cnt		 DMA write channel number
+ * @rd_ch_cnt		 DMA read channel number
+ * @rg_region		 DMA register region
+ * @ll_region_wr	 DMA descriptor link list memory for write channel
+ * @ll_region_rd	 DMA descriptor link list memory for read channel
+ * @dt_region_wr	 DMA data memory for write channel
+ * @dt_region_rd	 DMA data memory for read channel
+ * @mf			 DMA register map format
+ * @dw:			 struct dw_edma that is filled by dw_edma_probe()
  */
 struct dw_edma_chip {
 	struct device		*dev;
 	int			id;
+	int			nr_irqs;
+	const struct dw_edma_core_ops   *ops;
+
+	struct dw_edma_region	rg_region;
+
+	u16			wr_ch_cnt;
+	u16			rd_ch_cnt;
+	/* link list address */
+	struct dw_edma_region	ll_region_wr[EDMA_MAX_WR_CH];
+	struct dw_edma_region	ll_region_rd[EDMA_MAX_RD_CH];
+
+	/* data region */
+	struct dw_edma_region	dt_region_wr[EDMA_MAX_WR_CH];
+	struct dw_edma_region	dt_region_rd[EDMA_MAX_RD_CH];
+
+	enum dw_edma_map_format	mf;
+
 	struct dw_edma		*dw;
 };
 
-- 
cgit v1.2.3


From e51b3048116a6e10b96bd5298cbcb209b6d729cd Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:54 -0500
Subject: dmaengine: dw-edma: Change rg_region to reg_base in struct
 dw_edma_chip

struct dw_edma_region rg_region included virtual address, physical address
and size information. But only the virtual address is used by EDMA driver.
Change it to void __iomem *reg_base to clean up code.

Link: https://lore.kernel.org/r/20220524152159.2370739-4-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-pcie.c       | 6 +++---
 drivers/dma/dw-edma/dw-edma-v0-core.c    | 2 +-
 drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 2 +-
 include/linux/dma/edma.h                 | 3 ++-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index 965513698f3d..9553fb75091d 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -216,8 +216,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	chip->wr_ch_cnt = vsec_data.wr_ch_cnt;
 	chip->rd_ch_cnt = vsec_data.rd_ch_cnt;
 
-	chip->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar];
-	if (!chip->rg_region.vaddr)
+	chip->reg_base = pcim_iomap_table(pdev)[vsec_data.rg.bar];
+	if (!chip->reg_base)
 		return -ENOMEM;
 
 	for (i = 0; i < chip->wr_ch_cnt; i++) {
@@ -282,7 +282,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 
 	pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p)\n",
 		vsec_data.rg.bar, vsec_data.rg.off, vsec_data.rg.sz,
-		chip->rg_region.vaddr);
+		chip->reg_base);
 
 
 	for (i = 0; i < chip->wr_ch_cnt; i++) {
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 999e03896186..403ade40c1b1 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -25,7 +25,7 @@ enum dw_edma_control {
 
 static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw)
 {
-	return dw->chip->rg_region.vaddr;
+	return dw->chip->reg_base;
 }
 
 #define SET_32(dw, name, value)				\
diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
index b765adb96999..5226c9014703 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c
@@ -288,7 +288,7 @@ void dw_edma_v0_debugfs_on(struct dw_edma *_dw)
 	if (!dw)
 		return;
 
-	regs = dw->chip->rg_region.vaddr;
+	regs = dw->chip->reg_base;
 	if (!regs)
 		return;
 
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index 6f64e90d5c38..df9ba3ecb437 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -39,6 +39,7 @@ enum dw_edma_map_format {
  * @id:			 instance ID
  * @nr_irqs:		 total number of DMA IRQs
  * @ops			 DMA channel to IRQ number mapping
+ * @reg_base		 DMA register base address
  * @wr_ch_cnt		 DMA write channel number
  * @rd_ch_cnt		 DMA read channel number
  * @rg_region		 DMA register region
@@ -55,7 +56,7 @@ struct dw_edma_chip {
 	int			nr_irqs;
 	const struct dw_edma_core_ops   *ops;
 
-	struct dw_edma_region	rg_region;
+	void __iomem		*reg_base;
 
 	u16			wr_ch_cnt;
 	u16			rd_ch_cnt;
-- 
cgit v1.2.3


From 6951ee96c649f6e963b98c11b2b1a92697d3c45c Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:55 -0500
Subject: dmaengine: dw-edma: Rename wr(rd)_ch_cnt to ll_wr(rd)_cnt in struct
 dw_edma_chip

The struct dw_edma contains wr(rd)_ch_cnt fields. The EDMA driver gets
write(read) channel number from register, then saves these into dw_edma.
The wr(rd)_ch_cnt in dw_edma_chip actually means how many link list memory
are available in ll_region_wr(rd)[EDMA_MAX_WR_CH]. Rename it to
ll_wr(rd)_cnt to indicate actual usage.

Link: https://lore.kernel.org/r/20220524152159.2370739-5-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-core.c |  4 ++--
 drivers/dma/dw-edma/dw-edma-pcie.c | 12 ++++++------
 include/linux/dma/edma.h           |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 9a4c96f7d9d9..af037ec61a86 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -918,11 +918,11 @@ int dw_edma_probe(struct dw_edma_chip *chip)
 
 	raw_spin_lock_init(&dw->lock);
 
-	dw->wr_ch_cnt = min_t(u16, chip->wr_ch_cnt,
+	dw->wr_ch_cnt = min_t(u16, chip->ll_wr_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE));
 	dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, EDMA_MAX_WR_CH);
 
-	dw->rd_ch_cnt = min_t(u16, chip->rd_ch_cnt,
+	dw->rd_ch_cnt = min_t(u16, chip->ll_rd_cnt,
 			      dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ));
 	dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, EDMA_MAX_RD_CH);
 
diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c
index 9553fb75091d..d6b5e2463884 100644
--- a/drivers/dma/dw-edma/dw-edma-pcie.c
+++ b/drivers/dma/dw-edma/dw-edma-pcie.c
@@ -213,14 +213,14 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 	chip->nr_irqs = nr_irqs;
 	chip->ops = &dw_edma_pcie_core_ops;
 
-	chip->wr_ch_cnt = vsec_data.wr_ch_cnt;
-	chip->rd_ch_cnt = vsec_data.rd_ch_cnt;
+	chip->ll_wr_cnt = vsec_data.wr_ch_cnt;
+	chip->ll_rd_cnt = vsec_data.rd_ch_cnt;
 
 	chip->reg_base = pcim_iomap_table(pdev)[vsec_data.rg.bar];
 	if (!chip->reg_base)
 		return -ENOMEM;
 
-	for (i = 0; i < chip->wr_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_wr_cnt; i++) {
 		struct dw_edma_region *ll_region = &chip->ll_region_wr[i];
 		struct dw_edma_region *dt_region = &chip->dt_region_wr[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_wr[i];
@@ -245,7 +245,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		dt_region->sz = dt_block->sz;
 	}
 
-	for (i = 0; i < chip->rd_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_rd_cnt; i++) {
 		struct dw_edma_region *ll_region = &chip->ll_region_rd[i];
 		struct dw_edma_region *dt_region = &chip->dt_region_rd[i];
 		struct dw_edma_block *ll_block = &vsec_data.ll_rd[i];
@@ -285,7 +285,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 		chip->reg_base);
 
 
-	for (i = 0; i < chip->wr_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_wr_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_wr[i].bar,
 			vsec_data.ll_wr[i].off, chip->ll_region_wr[i].sz,
@@ -297,7 +297,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev,
 			chip->dt_region_wr[i].vaddr, &chip->dt_region_wr[i].paddr);
 	}
 
-	for (i = 0; i < chip->rd_ch_cnt; i++) {
+	for (i = 0; i < chip->ll_rd_cnt; i++) {
 		pci_dbg(pdev, "L. List:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n",
 			i, vsec_data.ll_rd[i].bar,
 			vsec_data.ll_rd[i].off, chip->ll_region_rd[i].sz,
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index df9ba3ecb437..fdbbeda170a9 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -40,8 +40,8 @@ enum dw_edma_map_format {
  * @nr_irqs:		 total number of DMA IRQs
  * @ops			 DMA channel to IRQ number mapping
  * @reg_base		 DMA register base address
- * @wr_ch_cnt		 DMA write channel number
- * @rd_ch_cnt		 DMA read channel number
+ * @ll_wr_cnt		 DMA write link list count
+ * @ll_rd_cnt		 DMA read link list count
  * @rg_region		 DMA register region
  * @ll_region_wr	 DMA descriptor link list memory for write channel
  * @ll_region_rd	 DMA descriptor link list memory for read channel
@@ -58,8 +58,8 @@ struct dw_edma_chip {
 
 	void __iomem		*reg_base;
 
-	u16			wr_ch_cnt;
-	u16			rd_ch_cnt;
+	u16			ll_wr_cnt;
+	u16			ll_rd_cnt;
 	/* link list address */
 	struct dw_edma_region	ll_region_wr[EDMA_MAX_WR_CH];
 	struct dw_edma_region	ll_region_rd[EDMA_MAX_RD_CH];
-- 
cgit v1.2.3


From d6b03171f9fc8127b3a7adfd4e74ee5d4dae5d14 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 24 May 2022 10:21:58 -0500
Subject: dmaengine: dw-edma: Add support for chip-specific flags

Add a "flags" field to the "struct dw_edma_chip" so that the controller
drivers can pass flags that are relevant to the platform.

DW_EDMA_CHIP_LOCAL - Used by the controller drivers accessing eDMA
locally. Local eDMA access doesn't require generating MSIs to the remote.

Link: https://lore.kernel.org/r/20220524152159.2370739-8-Frank.Li@nxp.com
Tested-by: Serge Semin <fancer.lancer@gmail.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-By: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-edma/dw-edma-v0-core.c |  9 ++++++---
 include/linux/dma/edma.h              | 10 ++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 403ade40c1b1..607647dacc29 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -301,6 +301,7 @@ u32 dw_edma_v0_core_status_abort_int(struct dw_edma *dw, enum dw_edma_dir dir)
 static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
 {
 	struct dw_edma_burst *child;
+	struct dw_edma_chan *chan = chunk->chan;
 	struct dw_edma_v0_lli __iomem *lli;
 	struct dw_edma_v0_llp __iomem *llp;
 	u32 control = 0, i = 0;
@@ -314,9 +315,11 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk)
 	j = chunk->bursts_alloc;
 	list_for_each_entry(child, &chunk->burst->list, list) {
 		j--;
-		if (!j)
-			control |= (DW_EDMA_V0_LIE | DW_EDMA_V0_RIE);
-
+		if (!j) {
+			control |= DW_EDMA_V0_LIE;
+			if (!(chan->dw->chip->flags & DW_EDMA_CHIP_LOCAL))
+				control |= DW_EDMA_V0_RIE;
+		}
 		/* Channel control */
 		SET_LL_32(&lli[i].control, control);
 		/* Transfer size */
diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h
index fdbbeda170a9..7d8062e9c544 100644
--- a/include/linux/dma/edma.h
+++ b/include/linux/dma/edma.h
@@ -33,12 +33,21 @@ enum dw_edma_map_format {
 	EDMA_MF_HDMA_COMPAT = 0x5
 };
 
+/**
+ * enum dw_edma_chip_flags - Flags specific to an eDMA chip
+ * @DW_EDMA_CHIP_LOCAL:		eDMA is used locally by an endpoint
+ */
+enum dw_edma_chip_flags {
+	DW_EDMA_CHIP_LOCAL	= BIT(0),
+};
+
 /**
  * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
  * @dev:		 struct device of the eDMA controller
  * @id:			 instance ID
  * @nr_irqs:		 total number of DMA IRQs
  * @ops			 DMA channel to IRQ number mapping
+ * @flags		 dw_edma_chip_flags
  * @reg_base		 DMA register base address
  * @ll_wr_cnt		 DMA write link list count
  * @ll_rd_cnt		 DMA read link list count
@@ -55,6 +64,7 @@ struct dw_edma_chip {
 	int			id;
 	int			nr_irqs;
 	const struct dw_edma_core_ops   *ops;
+	u32			flags;
 
 	void __iomem		*reg_base;
 
-- 
cgit v1.2.3


From ba1afa676d0babf99e99f5415db43fdd7ecef104 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 24 Jun 2022 17:31:47 +0800
Subject: lib: bitmap: fix the duplicated comments on bitmap_to_arr64()

Thanks to the recent commit 0a97953fd221 ("lib: add
bitmap_{from,to}_arr64") now we can directly convert a U64 value into a
bitmap and vice verse.

However when checking the header there is duplicated helper for
bitmap_to_arr64(), but no bitmap_from_arr64().

Just fix the copy-n-paste error.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 2e6cd5681040..f091a1664bf1 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -71,9 +71,9 @@ struct device;
  *  bitmap_release_region(bitmap, pos, order)   Free specified bit region
  *  bitmap_allocate_region(bitmap, pos, order)  Allocate specified bit region
  *  bitmap_from_arr32(dst, buf, nbits)          Copy nbits from u32[] buf to dst
+ *  bitmap_from_arr64(dst, buf, nbits)          Copy nbits from u64[] buf to dst
  *  bitmap_to_arr32(buf, src, nbits)            Copy nbits from buf to u32[] dst
  *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from buf to u64[] dst
- *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from buf to u64[] dst
  *  bitmap_get_value8(map, start)               Get 8bit value from map at start
  *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
  *
-- 
cgit v1.2.3


From e61c451476e61450f6771ce03bbc01210a09be16 Mon Sep 17 00:00:00 2001
From: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Date: Fri, 22 Apr 2022 14:24:35 +0800
Subject: dma-mapping: Add dma_release_coherent_memory to DMA API

Add dma_release_coherent_memory to DMA API to allow dma
user call it to release dev->dma_mem when the device is
removed.

Signed-off-by: Mark-PK Tsai <mark-pk.tsai@mediatek.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220422062436.14384-2-mark-pk.tsai@mediatek.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/dma-map-ops.h |  3 +++
 kernel/dma/coherent.c       | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d5b06b3a4a6..53db9655efe9 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -166,6 +166,7 @@ static inline void dma_pernuma_cma_reserve(void) { }
 #ifdef CONFIG_DMA_DECLARE_COHERENT
 int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 		dma_addr_t device_addr, size_t size);
+void dma_release_coherent_memory(struct device *dev);
 int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
 		dma_addr_t *dma_handle, void **ret);
 int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr);
@@ -177,6 +178,8 @@ static inline int dma_declare_coherent_memory(struct device *dev,
 {
 	return -ENOSYS;
 }
+
+#define dma_release_coherent_memory(dev) (0)
 #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0)
 #define dma_release_from_dev_coherent(dev, order, vaddr) (0)
 #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0)
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 375fb3c9538d..c21abc77c53e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -74,7 +74,7 @@ out_unmap_membase:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
 {
 	if (!mem)
 		return;
@@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 
 	ret = dma_assign_coherent_memory(dev, mem);
 	if (ret)
-		dma_release_coherent_memory(mem);
+		_dma_release_coherent_memory(mem);
 	return ret;
 }
 
+void dma_release_coherent_memory(struct device *dev)
+{
+	if (dev)
+		_dma_release_coherent_memory(dev->dma_mem);
+}
+
 static void *__dma_alloc_from_coherent(struct device *dev,
 				       struct dma_coherent_mem *mem,
 				       ssize_t size, dma_addr_t *dma_handle)
-- 
cgit v1.2.3


From 0cae04373b77a117830e5f7d7aaa7eaf01f950d5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 6 Jun 2022 09:47:33 +0200
Subject: dmaengine: remove DMA_MEMCPY_SG once again

This was removed before due to the complete lack of users, but
3218910fd585 ("dmaengine: Add core function and capability check for
DMA_MEMCPY_SG") and 29cf37fa6dd9 ("dmaengine: Add consumer for the new
DMA_MEMCPY_SG API function.") added it back despite still not having
any users whatsoever.

Fixes: 3218910fd585 ("dmaengine: Add core function and capability check for DMA_MEMCPY_SG")
Fixes: 29cf37fa6dd9 ("dmaengine: Add consumer for the new DMA_MEMCPY_SG API function.")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Michal Simek <michal.simek@amd.com>
Link: https://lore.kernel.org/r/20220606074733.622616-1-hch@lst.de
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/driver-api/dmaengine/provider.rst |  10 --
 drivers/dma/dmaengine.c                         |   7 --
 drivers/dma/xilinx/xilinx_dma.c                 | 122 ------------------------
 include/linux/dmaengine.h                       |  20 ----
 4 files changed, 159 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/dmaengine/provider.rst b/Documentation/driver-api/dmaengine/provider.rst
index 1e0f1f85d10e..ceac2a300e32 100644
--- a/Documentation/driver-api/dmaengine/provider.rst
+++ b/Documentation/driver-api/dmaengine/provider.rst
@@ -162,16 +162,6 @@ Currently, the types available are:
 
   - The device is able to do memory to memory copies
 
-- - DMA_MEMCPY_SG
-
-  - The device supports memory to memory scatter-gather transfers.
-
-  - Even though a plain memcpy can look like a particular case of a
-    scatter-gather transfer, with a single chunk to copy, it's a distinct
-    transaction type in the mem2mem transfer case. This is because some very
-    simple devices might be able to do contiguous single-chunk memory copies,
-    but have no support for more complex SG transfers.
-
   - No matter what the overall size of the combined chunks for source and
     destination is, only as many bytes as the smallest of the two will be
     transmitted. That means the number and size of the scatter-gather buffers in
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index e80feeea0e01..c741b6431958 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -1153,13 +1153,6 @@ int dma_async_device_register(struct dma_device *device)
 		return -EIO;
 	}
 
-	if (dma_has_cap(DMA_MEMCPY_SG, device->cap_mask) && !device->device_prep_dma_memcpy_sg) {
-		dev_err(device->dev,
-			"Device claims capability %s, but op is not defined\n",
-			"DMA_MEMCPY_SG");
-		return -EIO;
-	}
-
 	if (dma_has_cap(DMA_XOR, device->cap_mask) && !device->device_prep_dma_xor) {
 		dev_err(device->dev,
 			"Device claims capability %s, but op is not defined\n",
diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index cd62bbb50e8b..6276934d4d2b 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -2127,126 +2127,6 @@ error:
 	return NULL;
 }
 
-/**
- * xilinx_cdma_prep_memcpy_sg - prepare descriptors for a memcpy_sg transaction
- * @dchan: DMA channel
- * @dst_sg: Destination scatter list
- * @dst_sg_len: Number of entries in destination scatter list
- * @src_sg: Source scatter list
- * @src_sg_len: Number of entries in source scatter list
- * @flags: transfer ack flags
- *
- * Return: Async transaction descriptor on success and NULL on failure
- */
-static struct dma_async_tx_descriptor *xilinx_cdma_prep_memcpy_sg(
-			struct dma_chan *dchan, struct scatterlist *dst_sg,
-			unsigned int dst_sg_len, struct scatterlist *src_sg,
-			unsigned int src_sg_len, unsigned long flags)
-{
-	struct xilinx_dma_chan *chan = to_xilinx_chan(dchan);
-	struct xilinx_dma_tx_descriptor *desc;
-	struct xilinx_cdma_tx_segment *segment, *prev = NULL;
-	struct xilinx_cdma_desc_hw *hw;
-	size_t len, dst_avail, src_avail;
-	dma_addr_t dma_dst, dma_src;
-
-	if (unlikely(dst_sg_len == 0 || src_sg_len == 0))
-		return NULL;
-
-	if (unlikely(!dst_sg  || !src_sg))
-		return NULL;
-
-	desc = xilinx_dma_alloc_tx_descriptor(chan);
-	if (!desc)
-		return NULL;
-
-	dma_async_tx_descriptor_init(&desc->async_tx, &chan->common);
-	desc->async_tx.tx_submit = xilinx_dma_tx_submit;
-
-	dst_avail = sg_dma_len(dst_sg);
-	src_avail = sg_dma_len(src_sg);
-	/*
-	 * loop until there is either no more source or no more destination
-	 * scatterlist entry
-	 */
-	while (true) {
-		len = min_t(size_t, src_avail, dst_avail);
-		len = min_t(size_t, len, chan->xdev->max_buffer_len);
-		if (len == 0)
-			goto fetch;
-
-		/* Allocate the link descriptor from DMA pool */
-		segment = xilinx_cdma_alloc_tx_segment(chan);
-		if (!segment)
-			goto error;
-
-		dma_dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) -
-			dst_avail;
-		dma_src = sg_dma_address(src_sg) + sg_dma_len(src_sg) -
-			src_avail;
-		hw = &segment->hw;
-		hw->control = len;
-		hw->src_addr = dma_src;
-		hw->dest_addr = dma_dst;
-		if (chan->ext_addr) {
-			hw->src_addr_msb = upper_32_bits(dma_src);
-			hw->dest_addr_msb = upper_32_bits(dma_dst);
-		}
-
-		if (prev) {
-			prev->hw.next_desc = segment->phys;
-			if (chan->ext_addr)
-				prev->hw.next_desc_msb =
-					upper_32_bits(segment->phys);
-		}
-
-		prev = segment;
-		dst_avail -= len;
-		src_avail -= len;
-		list_add_tail(&segment->node, &desc->segments);
-
-fetch:
-		/* Fetch the next dst scatterlist entry */
-		if (dst_avail == 0) {
-			if (dst_sg_len == 0)
-				break;
-			dst_sg = sg_next(dst_sg);
-			if (dst_sg == NULL)
-				break;
-			dst_sg_len--;
-			dst_avail = sg_dma_len(dst_sg);
-		}
-		/* Fetch the next src scatterlist entry */
-		if (src_avail == 0) {
-			if (src_sg_len == 0)
-				break;
-			src_sg = sg_next(src_sg);
-			if (src_sg == NULL)
-				break;
-			src_sg_len--;
-			src_avail = sg_dma_len(src_sg);
-		}
-	}
-
-	if (list_empty(&desc->segments)) {
-		dev_err(chan->xdev->dev,
-			"%s: Zero-size SG transfer requested\n", __func__);
-		goto error;
-	}
-
-	/* Link the last hardware descriptor with the first. */
-	segment = list_first_entry(&desc->segments,
-				struct xilinx_cdma_tx_segment, node);
-	desc->async_tx.phys = segment->phys;
-	prev->hw.next_desc = segment->phys;
-
-	return &desc->async_tx;
-
-error:
-	xilinx_dma_free_tx_descriptor(chan, desc);
-	return NULL;
-}
-
 /**
  * xilinx_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
  * @dchan: DMA channel
@@ -3240,9 +3120,7 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 					  DMA_RESIDUE_GRANULARITY_SEGMENT;
 	} else if (xdev->dma_config->dmatype == XDMA_TYPE_CDMA) {
 		dma_cap_set(DMA_MEMCPY, xdev->common.cap_mask);
-		dma_cap_set(DMA_MEMCPY_SG, xdev->common.cap_mask);
 		xdev->common.device_prep_dma_memcpy = xilinx_cdma_prep_memcpy;
-		xdev->common.device_prep_dma_memcpy_sg = xilinx_cdma_prep_memcpy_sg;
 		/* Residue calculation is supported by only AXI DMA and CDMA */
 		xdev->common.residue_granularity =
 					  DMA_RESIDUE_GRANULARITY_SEGMENT;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index b46b88e6aa0d..c923f4e60f24 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -50,7 +50,6 @@ enum dma_status {
  */
 enum dma_transaction_type {
 	DMA_MEMCPY,
-	DMA_MEMCPY_SG,
 	DMA_XOR,
 	DMA_PQ,
 	DMA_XOR_VAL,
@@ -887,11 +886,6 @@ struct dma_device {
 	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
 		struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
 		size_t len, unsigned long flags);
-	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy_sg)(
-		struct dma_chan *chan,
-		struct scatterlist *dst_sg, unsigned int dst_nents,
-		struct scatterlist *src_sg, unsigned int src_nents,
-		unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
 		struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
 		unsigned int src_cnt, size_t len, unsigned long flags);
@@ -1060,20 +1054,6 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy(
 						    len, flags);
 }
 
-static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy_sg(
-		struct dma_chan *chan,
-		struct scatterlist *dst_sg, unsigned int dst_nents,
-		struct scatterlist *src_sg, unsigned int src_nents,
-		unsigned long flags)
-{
-	if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy_sg)
-		return NULL;
-
-	return chan->device->device_prep_dma_memcpy_sg(chan, dst_sg, dst_nents,
-						       src_sg, src_nents,
-						       flags);
-}
-
 static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan,
 		enum dma_desc_metadata_mode mode)
 {
-- 
cgit v1.2.3


From 8da443b1a4036b5863fd2ec7e0251164b704c726 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 14 Jun 2022 11:05:34 +0200
Subject: tty/vt: consolemap: rename struct vc_data::vc_uni_pagedir*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As a follow-up to the commit 4173f018aae1 (tty/vt: consolemap: rename
and document struct uni_pagedir), rename also the members of struct
vc_data. I.e. pagedir -> pagedict. And while touching all the places,
remove also the unnecessary vc_ prefix.

Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220614090537.15557-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/consolemap.c             | 46 ++++++++++++++++-----------------
 drivers/tty/vt/vt.c                     |  8 +++---
 drivers/usb/misc/sisusbvga/sisusb_con.c |  2 +-
 drivers/video/console/vgacon.c          |  8 +++---
 drivers/video/fbdev/core/fbcon.c        |  8 +++---
 include/linux/console_struct.h          |  4 +--
 6 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 3d0e10dac6d9..16d0d8f04f0e 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -296,7 +296,7 @@ u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode)
 	if (glyph >= MAX_GLYPH)
 		return 0;
 
-	p = *conp->vc_uni_pagedir_loc;
+	p = *conp->uni_pagedict_loc;
 	if (!p)
 		return glyph;
 
@@ -323,7 +323,7 @@ static void update_user_maps(void)
 	for (i = 0; i < MAX_NR_CONSOLES; i++) {
 		if (!vc_cons_allocated(i))
 			continue;
-		p = *vc_cons[i].d->vc_uni_pagedir_loc;
+		p = *vc_cons[i].d->uni_pagedict_loc;
 		if (p && p != q) {
 			set_inverse_transl(vc_cons[i].d, p, USER_MAP);
 			set_inverse_trans_unicode(p);
@@ -445,10 +445,10 @@ void con_free_unimap(struct vc_data *vc)
 {
 	struct uni_pagedict *p;
 
-	p = *vc->vc_uni_pagedir_loc;
+	p = *vc->uni_pagedict_loc;
 	if (!p)
 		return;
-	*vc->vc_uni_pagedir_loc = NULL;
+	*vc->uni_pagedict_loc = NULL;
 	if (--p->refcount)
 		return;
 	con_release_unimap(p);
@@ -463,7 +463,7 @@ static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *dict1)
 	for (cons = 0; cons < MAX_NR_CONSOLES; cons++) {
 		if (!vc_cons_allocated(cons))
 			continue;
-		dict2 = *vc_cons[cons].d->vc_uni_pagedir_loc;
+		dict2 = *vc_cons[cons].d->uni_pagedict_loc;
 		if (!dict2 || dict2 == dict1 || dict2->sum != dict1->sum)
 			continue;
 		for (d = 0; d < UNI_DIRS; d++) {
@@ -487,7 +487,7 @@ static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *dict1)
 		}
 		if (d == UNI_DIRS) {
 			dict2->refcount++;
-			*conp->vc_uni_pagedir_loc = dict2;
+			*conp->uni_pagedict_loc = dict2;
 			con_release_unimap(dict1);
 			kfree(dict1);
 			return 1;
@@ -531,14 +531,14 @@ con_insert_unipair(struct uni_pagedict *p, u_short unicode, u_short fontpos)
 
 static int con_allocate_new(struct vc_data *vc)
 {
-	struct uni_pagedict *new, *old = *vc->vc_uni_pagedir_loc;
+	struct uni_pagedict *new, *old = *vc->uni_pagedict_loc;
 
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
 	new->refcount = 1;
-	*vc->vc_uni_pagedir_loc = new;
+	*vc->uni_pagedict_loc = new;
 
 	if (old)
 		old->refcount--;
@@ -549,7 +549,7 @@ static int con_allocate_new(struct vc_data *vc)
 /* Caller must hold the lock */
 static int con_do_clear_unimap(struct vc_data *vc)
 {
-	struct uni_pagedict *old = *vc->vc_uni_pagedir_loc;
+	struct uni_pagedict *old = *vc->uni_pagedict_loc;
 
 	if (!old || old->refcount > 1)
 		return con_allocate_new(vc);
@@ -583,7 +583,7 @@ static struct uni_pagedict *con_unshare_unimap(struct vc_data *vc,
 	if (ret)
 		return ERR_PTR(ret);
 
-	new = *vc->vc_uni_pagedir_loc;
+	new = *vc->uni_pagedict_loc;
 
 	/*
 	 * uni_pgdir is a 32*32*64 table with rows allocated when its first
@@ -616,7 +616,7 @@ static struct uni_pagedict *con_unshare_unimap(struct vc_data *vc,
 				ret = con_insert_unipair(new, uni, row[g]);
 				if (ret) {
 					old->refcount++;
-					*vc->vc_uni_pagedir_loc = old;
+					*vc->uni_pagedict_loc = old;
 					con_release_unimap(new);
 					kfree(new);
 					return ERR_PTR(ret);
@@ -644,7 +644,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list)
 	console_lock();
 
 	/* Save original vc_unipagdir_loc in case we allocate a new one */
-	dict = *vc->vc_uni_pagedir_loc;
+	dict = *vc->uni_pagedict_loc;
 	if (!dict) {
 		err = -EINVAL;
 		goto out_unlock;
@@ -704,12 +704,12 @@ int con_set_default_unimap(struct vc_data *vc)
 	u16 *dfont;
 
 	if (dflt) {
-		dict = *vc->vc_uni_pagedir_loc;
+		dict = *vc->uni_pagedict_loc;
 		if (dict == dflt)
 			return 0;
 
 		dflt->refcount++;
-		*vc->vc_uni_pagedir_loc = dflt;
+		*vc->uni_pagedict_loc = dflt;
 		if (dict && !--dict->refcount) {
 			con_release_unimap(dict);
 			kfree(dict);
@@ -723,7 +723,7 @@ int con_set_default_unimap(struct vc_data *vc)
 	if (err)
 		return err;
 
-	dict = *vc->vc_uni_pagedir_loc;
+	dict = *vc->uni_pagedict_loc;
 	dfont = dfont_unitable;
 
 	for (fontpos = 0; fontpos < 256U; fontpos++)
@@ -734,7 +734,7 @@ int con_set_default_unimap(struct vc_data *vc)
 		}
 
 	if (con_unify_unimap(vc, dict)) {
-		dflt = *vc->vc_uni_pagedir_loc;
+		dflt = *vc->uni_pagedict_loc;
 		return err;
 	}
 
@@ -757,14 +757,14 @@ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc)
 {
 	struct uni_pagedict *src;
 
-	if (!*src_vc->vc_uni_pagedir_loc)
+	if (!*src_vc->uni_pagedict_loc)
 		return -EINVAL;
-	if (*dst_vc->vc_uni_pagedir_loc == *src_vc->vc_uni_pagedir_loc)
+	if (*dst_vc->uni_pagedict_loc == *src_vc->uni_pagedict_loc)
 		return 0;
 	con_free_unimap(dst_vc);
-	src = *src_vc->vc_uni_pagedir_loc;
+	src = *src_vc->uni_pagedict_loc;
 	src->refcount++;
-	*dst_vc->vc_uni_pagedir_loc = src;
+	*dst_vc->uni_pagedict_loc = src;
 	return 0;
 }
 EXPORT_SYMBOL(con_copy_unimap);
@@ -791,7 +791,7 @@ int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct,
 	console_lock();
 
 	ect = 0;
-	dict = *vc->vc_uni_pagedir_loc;
+	dict = *vc->uni_pagedict_loc;
 	if (!dict)
 		goto unlock;
 
@@ -873,7 +873,7 @@ int conv_uni_to_pc(struct vc_data *conp, long ucs)
 	else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE)
 		return ucs & UNI_DIRECT_MASK;
 
-	dict = *conp->vc_uni_pagedir_loc;
+	dict = *conp->uni_pagedict_loc;
 	if (!dict)
 		return -3;
 
@@ -903,7 +903,7 @@ console_map_init(void)
 	int i;
 
 	for (i = 0; i < MAX_NR_CONSOLES; i++)
-		if (vc_cons_allocated(i) && !*vc_cons[i].d->vc_uni_pagedir_loc)
+		if (vc_cons_allocated(i) && !*vc_cons[i].d->uni_pagedict_loc)
 			con_set_default_unimap(vc_cons[i].d);
 }
 
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index c718b0d01e3d..1899b8a5d73e 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1063,10 +1063,10 @@ static void visual_init(struct vc_data *vc, int num, int init)
 	__module_get(vc->vc_sw->owner);
 	vc->vc_num = num;
 	vc->vc_display_fg = &master_display_fg;
-	if (vc->vc_uni_pagedir_loc)
+	if (vc->uni_pagedict_loc)
 		con_free_unimap(vc);
-	vc->vc_uni_pagedir_loc = &vc->vc_uni_pagedir;
-	vc->vc_uni_pagedir = NULL;
+	vc->uni_pagedict_loc = &vc->uni_pagedict;
+	vc->uni_pagedict = NULL;
 	vc->vc_hi_font_mask = 0;
 	vc->vc_complement_mask = 0;
 	vc->vc_can_do_color = 0;
@@ -1136,7 +1136,7 @@ int vc_allocate(unsigned int currcons)	/* return 0 on success */
 
 	visual_init(vc, currcons, 1);
 
-	if (!*vc->vc_uni_pagedir_loc)
+	if (!*vc->uni_pagedict_loc)
 		con_set_default_unimap(vc);
 
 	err = -EINVAL;
diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c
index dfa0d5ce6012..fcb95fb639e0 100644
--- a/drivers/usb/misc/sisusbvga/sisusb_con.c
+++ b/drivers/usb/misc/sisusbvga/sisusb_con.c
@@ -248,7 +248,7 @@ sisusbcon_init(struct vc_data *c, int init)
 	 */
 	kref_get(&sisusb->kref);
 
-	if (!*c->vc_uni_pagedir_loc)
+	if (!*c->uni_pagedict_loc)
 		con_set_default_unimap(c);
 
 	mutex_unlock(&sisusb->lock);
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 058a78b8dbcf..fcdf017e2665 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -367,10 +367,10 @@ static void vgacon_init(struct vc_data *c, int init)
 	c->vc_complement_mask = 0x7700;
 	if (vga_512_chars)
 		c->vc_hi_font_mask = 0x0800;
-	p = *c->vc_uni_pagedir_loc;
-	if (c->vc_uni_pagedir_loc != &vgacon_uni_pagedir) {
+	p = *c->uni_pagedict_loc;
+	if (c->uni_pagedict_loc != &vgacon_uni_pagedir) {
 		con_free_unimap(c);
-		c->vc_uni_pagedir_loc = &vgacon_uni_pagedir;
+		c->uni_pagedict_loc = &vgacon_uni_pagedir;
 		vgacon_refcount++;
 	}
 	if (!vgacon_uni_pagedir && p)
@@ -392,7 +392,7 @@ static void vgacon_deinit(struct vc_data *c)
 
 	if (!--vgacon_refcount)
 		con_free_unimap(c);
-	c->vc_uni_pagedir_loc = &c->vc_uni_pagedir;
+	c->uni_pagedict_loc = &c->uni_pagedict;
 	con_set_default_unimap(c);
 }
 
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index c4e91715ef00..b10236fe5886 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -1058,9 +1058,9 @@ static void fbcon_init(struct vc_data *vc, int init)
 			vc->vc_complement_mask <<= 1;
 	}
 
-	if (!*svc->vc_uni_pagedir_loc)
+	if (!*svc->uni_pagedict_loc)
 		con_set_default_unimap(svc);
-	if (!*vc->vc_uni_pagedir_loc)
+	if (!*vc->uni_pagedict_loc)
 		con_copy_unimap(vc, svc);
 
 	ops = info->fbcon_par;
@@ -1382,9 +1382,9 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var,
 			vc->vc_complement_mask <<= 1;
 	}
 
-	if (!*svc->vc_uni_pagedir_loc)
+	if (!*svc->uni_pagedict_loc)
 		con_set_default_unimap(svc);
-	if (!*vc->vc_uni_pagedir_loc)
+	if (!*vc->uni_pagedict_loc)
 		con_copy_unimap(vc, svc);
 
 	cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres);
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index f75033f0277f..1518568aaf0f 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -157,8 +157,8 @@ struct vc_data {
 	unsigned int	vc_bell_duration;	/* Console bell duration */
 	unsigned short	vc_cur_blink_ms;	/* Cursor blink duration */
 	struct vc_data **vc_display_fg;		/* [!] Ptr to var holding fg console for this display */
-	struct uni_pagedict *vc_uni_pagedir;
-	struct uni_pagedict **vc_uni_pagedir_loc; /* [!] Location of uni_pagedict variable for this console */
+	struct uni_pagedict *uni_pagedict;
+	struct uni_pagedict **uni_pagedict_loc; /* [!] Location of uni_pagedict variable for this console */
 	struct uni_screen *vc_uni_screen;	/* unicode screen content */
 	/* additional information is in vt_kern.h */
 };
-- 
cgit v1.2.3


From ab24a01b276508dc884761bcb8e2759c36702377 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Tue, 14 Jun 2022 10:50:54 +0300
Subject: tty: Add closing marker into comment in tty_ldisc.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The closing `` is missing. Add it.

Fixes: 6bb6fa6908eb ("tty: Implement lookahead to process XON/XOFF timely")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/9bc6d45d-48c8-519-1646-78ba22505b1f@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty_ldisc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 33678e1936f6..ede6f2157f32 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -187,7 +187,7 @@ int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
  *	function for automatic flow control.
  *
  * @lookahead_buf: [DRV] ``void ()(struct tty_struct *tty,
- *			const unsigned char *cp, const char *fp, int count)
+ *			const unsigned char *cp, const char *fp, int count)``
  *
  *	This function is called by the low-level tty driver for characters
  *	not eaten by ->receive_buf() or ->receive_buf2(). It is useful for
-- 
cgit v1.2.3


From f9008285bb69e4713918a665250ab2d356b731ba Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 13 Jun 2022 14:39:05 +0300
Subject: serial: Drop timeout from uart_port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 31f6bd7fad3b ("serial: Store character timing information
to uart_port"), per frame timing information is available on uart_port.
Uart port's timeout can be derived from frame_time by multiplying with
fifosize.

Most callers of uart_poll_timeout are not made under port's lock. To be
on the safe side, make sure frame_time is only accessed once. As
fifo_size is effectively a constant, it shouldn't cause any issues.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220613113905.22962-1-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/driver.rst |  5 +++--
 drivers/tty/serial/mux.c                   |  6 ------
 drivers/tty/serial/serial_core.c           | 25 ++++++++++---------------
 include/linux/serial_core.h                | 16 ++++++++++++++--
 4 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
index 7ef83fd3917b..1e7ab4142d49 100644
--- a/Documentation/driver-api/serial/driver.rst
+++ b/Documentation/driver-api/serial/driver.rst
@@ -422,8 +422,9 @@ Other functions
 ---------------
 
 uart_update_timeout(port,cflag,baud)
-	Update the FIFO drain timeout, port->timeout, according to the
-	number of bits, parity, stop bits and baud rate.
+	Update the frame timing information according to the number of bits,
+	parity, stop bits and baud rate. The FIFO drain timeout is derived
+	from the frame timing information.
 
 	Locking: caller is expected to take port->lock
 
diff --git a/drivers/tty/serial/mux.c b/drivers/tty/serial/mux.c
index 643dfbcc43f9..0ba0f4d9459d 100644
--- a/drivers/tty/serial/mux.c
+++ b/drivers/tty/serial/mux.c
@@ -481,12 +481,6 @@ static int __init mux_probe(struct parisc_device *dev)
 		port->line	= port_cnt;
 		port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_MUX_CONSOLE);
 
-		/* The port->timeout needs to match what is present in
-		 * uart_wait_until_sent in serial_core.c.  Otherwise
-		 * the time spent in msleep_interruptable will be very
-		 * long, causing the appearance of a console hang.
-		 */
-		port->timeout   = HZ / 50;
 		spin_lock_init(&port->lock);
 
 		status = uart_add_one_port(&mux_driver, port);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 1368b0ef7d7f..75ece750bedc 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -327,13 +327,14 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
 }
 
 /**
- *	uart_update_timeout - update per-port FIFO timeout.
+ *	uart_update_timeout - update per-port frame timing information.
  *	@port:  uart_port structure describing the port
  *	@cflag: termios cflag value
  *	@baud:  speed of the port
  *
- *	Set the port FIFO timeout value.  The @cflag value should
- *	reflect the actual hardware settings.
+ *	Set the port frame timing information from which the FIFO timeout
+ *	value is derived. The @cflag value should reflect the actual hardware
+ *	settings.
  */
 void
 uart_update_timeout(struct uart_port *port, unsigned int cflag,
@@ -343,13 +344,6 @@ uart_update_timeout(struct uart_port *port, unsigned int cflag,
 	u64 frame_time;
 
 	frame_time = (u64)size * NSEC_PER_SEC;
-	size *= port->fifosize;
-
-	/*
-	 * Figure the timeout to send the above number of bits.
-	 * Add .02 seconds of slop
-	 */
-	port->timeout = (HZ * size) / baud + HZ/50;
 	port->frame_time = DIV64_U64_ROUND_UP(frame_time, baud);
 }
 EXPORT_SYMBOL(uart_update_timeout);
@@ -1698,7 +1692,7 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
-	unsigned long char_time, expire;
+	unsigned long char_time, expire, fifo_timeout;
 
 	port = uart_port_ref(state);
 	if (!port)
@@ -1728,12 +1722,13 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 		 * amount of time to send the entire FIFO, it probably won't
 		 * ever clear.  This assumes the UART isn't doing flow
 		 * control, which is currently the case.  Hence, if it ever
-		 * takes longer than port->timeout, this is probably due to a
+		 * takes longer than FIFO timeout, this is probably due to a
 		 * UART bug of some kind.  So, we clamp the timeout parameter at
-		 * 2*port->timeout.
+		 * 2 * FIFO timeout.
 		 */
-		if (timeout == 0 || timeout > 2 * port->timeout)
-			timeout = 2 * port->timeout;
+		fifo_timeout = uart_fifo_timeout(port);
+		if (timeout == 0 || timeout > 2 * fifo_timeout)
+			timeout = 2 * fifo_timeout;
 	}
 
 	expire = jiffies + timeout;
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 8032ffa741ed..faaf2372c60d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -232,7 +232,6 @@ struct uart_port {
 
 	int			hw_stopped;		/* sw-assisted CTS flow state */
 	unsigned int		mctrl;			/* current modem ctrl settings */
-	unsigned int		timeout;		/* character-based timeout */
 	unsigned int		frame_time;		/* frame timing in ns */
 	unsigned int		type;			/* port type */
 	const struct uart_ops	*ops;
@@ -335,10 +334,23 @@ unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios
 				unsigned int max);
 unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud);
 
+/*
+ * Calculates FIFO drain time.
+ */
+static inline unsigned long uart_fifo_timeout(struct uart_port *port)
+{
+	u64 fifo_timeout = (u64)READ_ONCE(port->frame_time) * port->fifosize;
+
+	/* Add .02 seconds of slop */
+	fifo_timeout += 20 * NSEC_PER_MSEC;
+
+	return max(nsecs_to_jiffies(fifo_timeout), 1UL);
+}
+
 /* Base timer interval for polling */
 static inline int uart_poll_timeout(struct uart_port *port)
 {
-	int timeout = port->timeout;
+	int timeout = uart_fifo_timeout(port);
 
 	return timeout > 6 ? (timeout / 2 - 2) : 1;
 }
-- 
cgit v1.2.3


From eb47b59afb7e46c952d7b03884245364990d4910 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:54:23 +0300
Subject: serial: Convert SERIAL_XMIT_SIZE to UART_XMIT_SIZE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both UART_XMIT_SIZE and SERIAL_XMIT_SIZE are defined. Make them all
UART_XMIT_SIZE.

Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624205424.12686-6-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/amiserial.c         | 18 +++++++++---------
 drivers/tty/mips_ejtag_fdc.c    |  2 +-
 drivers/tty/serial/meson_uart.c |  2 +-
 drivers/tty/serial/owl-uart.c   |  2 +-
 drivers/tty/serial/rda-uart.c   |  2 +-
 include/linux/serial.h          |  6 ------
 6 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index afb2d373dd47..5458e2b1c125 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -51,6 +51,7 @@
 #include <linux/seq_file.h>
 #include <linux/serial.h>
 #include <linux/serial_reg.h>
+#include <linux/serial_core.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
@@ -283,12 +284,12 @@ static void transmit_chars(struct serial_state *info)
 
 	amiga_custom.serdat = info->xmit.buf[info->xmit.tail++] | 0x100;
 	mb();
-	info->xmit.tail = info->xmit.tail & (SERIAL_XMIT_SIZE-1);
+	info->xmit.tail = info->xmit.tail & (UART_XMIT_SIZE - 1);
 	info->icount.tx++;
 
 	if (CIRC_CNT(info->xmit.head,
 		     info->xmit.tail,
-		     SERIAL_XMIT_SIZE) < WAKEUP_CHARS)
+		     UART_XMIT_SIZE) < WAKEUP_CHARS)
 		tty_wakeup(info->tport.tty);
 
 #ifdef SERIAL_DEBUG_INTR
@@ -708,13 +709,13 @@ static int rs_put_char(struct tty_struct *tty, unsigned char ch)
 	local_irq_save(flags);
 	if (CIRC_SPACE(info->xmit.head,
 		       info->xmit.tail,
-		       SERIAL_XMIT_SIZE) == 0) {
+		       UART_XMIT_SIZE) == 0) {
 		local_irq_restore(flags);
 		return 0;
 	}
 
 	info->xmit.buf[info->xmit.head++] = ch;
-	info->xmit.head &= SERIAL_XMIT_SIZE-1;
+	info->xmit.head &= UART_XMIT_SIZE - 1;
 	local_irq_restore(flags);
 	return 1;
 }
@@ -753,15 +754,14 @@ static int rs_write(struct tty_struct * tty, const unsigned char *buf, int count
 	while (1) {
 		c = CIRC_SPACE_TO_END(info->xmit.head,
 				      info->xmit.tail,
-				      SERIAL_XMIT_SIZE);
+				      UART_XMIT_SIZE);
 		if (count < c)
 			c = count;
 		if (c <= 0) {
 			break;
 		}
 		memcpy(info->xmit.buf + info->xmit.head, buf, c);
-		info->xmit.head = ((info->xmit.head + c) &
-				   (SERIAL_XMIT_SIZE-1));
+		info->xmit.head = (info->xmit.head + c) & (UART_XMIT_SIZE - 1);
 		buf += c;
 		count -= c;
 		ret += c;
@@ -788,14 +788,14 @@ static unsigned int rs_write_room(struct tty_struct *tty)
 {
 	struct serial_state *info = tty->driver_data;
 
-	return CIRC_SPACE(info->xmit.head, info->xmit.tail, SERIAL_XMIT_SIZE);
+	return CIRC_SPACE(info->xmit.head, info->xmit.tail, UART_XMIT_SIZE);
 }
 
 static unsigned int rs_chars_in_buffer(struct tty_struct *tty)
 {
 	struct serial_state *info = tty->driver_data;
 
-	return CIRC_CNT(info->xmit.head, info->xmit.tail, SERIAL_XMIT_SIZE);
+	return CIRC_CNT(info->xmit.head, info->xmit.tail, UART_XMIT_SIZE);
 }
 
 static void rs_flush_buffer(struct tty_struct *tty)
diff --git a/drivers/tty/mips_ejtag_fdc.c b/drivers/tty/mips_ejtag_fdc.c
index 49907427a165..e81701a66429 100644
--- a/drivers/tty/mips_ejtag_fdc.c
+++ b/drivers/tty/mips_ejtag_fdc.c
@@ -916,7 +916,7 @@ static int mips_ejtag_fdc_tty_probe(struct mips_cdmm_device *dev)
 	mips_ejtag_fdc_write(priv, REG_FDCFG, cfg);
 
 	/* Make each port's xmit FIFO big enough to fill FDC TX FIFO */
-	priv->xmit_size = min(tx_fifo * 4, (unsigned int)SERIAL_XMIT_SIZE);
+	priv->xmit_size = min(tx_fifo * 4, (unsigned int)UART_XMIT_SIZE);
 
 	driver = tty_alloc_driver(NUM_TTY_CHANNELS, TTY_DRIVER_REAL_RAW);
 	if (IS_ERR(driver))
diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c
index 4869c0059c98..6c8db19fd572 100644
--- a/drivers/tty/serial/meson_uart.c
+++ b/drivers/tty/serial/meson_uart.c
@@ -162,7 +162,7 @@ static void meson_uart_start_tx(struct uart_port *port)
 
 		ch = xmit->buf[xmit->tail];
 		writel(ch, port->membase + AML_UART_WFIFO);
-		xmit->tail = (xmit->tail+1) & (SERIAL_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 	}
 
diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c
index 44d20e5a7dd3..888e17e3f25f 100644
--- a/drivers/tty/serial/owl-uart.c
+++ b/drivers/tty/serial/owl-uart.c
@@ -201,7 +201,7 @@ static void owl_uart_send_chars(struct uart_port *port)
 
 		ch = xmit->buf[xmit->tail];
 		owl_uart_write(port, ch, OWL_UART_TXDAT);
-		xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 	}
 
diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c
index f556b4955f59..feb2054aba37 100644
--- a/drivers/tty/serial/rda-uart.c
+++ b/drivers/tty/serial/rda-uart.c
@@ -353,7 +353,7 @@ static void rda_uart_send_chars(struct uart_port *port)
 
 		ch = xmit->buf[xmit->tail];
 		rda_uart_write(port, ch, RDA_UART_RXTX_BUFFER);
-		xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		port->icount.tx++;
 	}
 
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 0b8b7d7c8f33..70a9866e4abb 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -9,7 +9,6 @@
 #ifndef _LINUX_SERIAL_H
 #define _LINUX_SERIAL_H
 
-#include <asm/page.h>
 #include <uapi/linux/serial.h>
 
 /* Helper for dealing with UART_LCR_WLEN* defines */
@@ -25,11 +24,6 @@ struct async_icount {
 	__u32	buf_overrun;
 };
 
-/*
- * The size of the serial xmit buffer is 1 page, or 4096 bytes
- */
-#define SERIAL_XMIT_SIZE PAGE_SIZE
-
 #include <linux/compiler.h>
 
 #endif /* _LINUX_SERIAL_H */
-- 
cgit v1.2.3


From 34619de1b8cb52afa90bbeb3b4fbad34c28f19cf Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:54:24 +0300
Subject: serial: Consolidate BOTH_EMPTY use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per file BOTH_EMPTY defines are littering our source code here and
there. Define once in serial.h and create helper for the check
too.

Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624205424.12686-7-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/mips/ath79/early_printk.c           |  9 +++++----
 drivers/accessibility/speakup/serialio.h |  3 +--
 drivers/tty/serial/8250/8250_early.c     |  4 +---
 drivers/tty/serial/8250/8250_port.c      | 12 +++++-------
 drivers/tty/serial/omap-serial.c         |  7 +++----
 drivers/tty/serial/pch_uart.c            |  7 +++----
 drivers/tty/serial/pxa.c                 |  5 ++---
 drivers/tty/serial/sunsu.c               |  4 +---
 drivers/tty/serial/vr41xx_siu.c          |  4 +---
 include/linux/serial.h                   |  9 +++++++++
 10 files changed, 31 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/ath79/early_printk.c b/arch/mips/ath79/early_printk.c
index 8751d067f98f..f6d02b425a10 100644
--- a/arch/mips/ath79/early_printk.c
+++ b/arch/mips/ath79/early_printk.c
@@ -8,6 +8,7 @@
 
 #include <linux/io.h>
 #include <linux/errno.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <asm/addrspace.h>
 #include <asm/setup.h>
@@ -29,15 +30,15 @@ static inline void prom_putchar_wait(void __iomem *reg, u32 mask, u32 val)
 	} while (1);
 }
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 static void prom_putchar_ar71xx(char ch)
 {
 	void __iomem *base = (void __iomem *)(KSEG1ADDR(AR71XX_UART_BASE));
 
-	prom_putchar_wait(base + UART_LSR * 4, BOTH_EMPTY, BOTH_EMPTY);
+	prom_putchar_wait(base + UART_LSR * 4, UART_LSR_BOTH_EMPTY,
+			  UART_LSR_BOTH_EMPTY);
 	__raw_writel((unsigned char)ch, base + UART_TX * 4);
-	prom_putchar_wait(base + UART_LSR * 4, BOTH_EMPTY, BOTH_EMPTY);
+	prom_putchar_wait(base + UART_LSR * 4, UART_LSR_BOTH_EMPTY,
+			  UART_LSR_BOTH_EMPTY);
 }
 
 static void prom_putchar_ar933x(char ch)
diff --git a/drivers/accessibility/speakup/serialio.h b/drivers/accessibility/speakup/serialio.h
index 6f8f86f161bb..b4f9a1925b81 100644
--- a/drivers/accessibility/speakup/serialio.h
+++ b/drivers/accessibility/speakup/serialio.h
@@ -33,9 +33,8 @@ struct old_serial_port {
 #define NUM_DISABLE_TIMEOUTS 3
 /* buffer timeout in ms */
 #define SPK_TIMEOUT 100
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
 
 #define spk_serial_tx_busy() \
-	((inb(speakup_info.port_tts + UART_LSR) & BOTH_EMPTY) != BOTH_EMPTY)
+	(!uart_lsr_tx_empty(inb(speakup_info.port_tts + UART_LSR)))
 
 #endif
diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c
index e52585064565..f271becfc46c 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -84,8 +84,6 @@ static void serial8250_early_out(struct uart_port *port, int offset, int value)
 	}
 }
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 static void serial_putc(struct uart_port *port, unsigned char c)
 {
 	unsigned int status;
@@ -94,7 +92,7 @@ static void serial_putc(struct uart_port *port, unsigned char c)
 
 	for (;;) {
 		status = serial8250_early_in(port, UART_LSR);
-		if ((status & BOTH_EMPTY) == BOTH_EMPTY)
+		if (uart_lsr_tx_empty(status))
 			break;
 		cpu_relax();
 	}
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 1311b00f8194..55b252954a92 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -50,8 +50,6 @@
 #define DEBUG_AUTOCONF(fmt...)	do { } while (0)
 #endif
 
-#define BOTH_EMPTY	(UART_LSR_TEMT | UART_LSR_THRE)
-
 /*
  * Here we define the default xmit fifo size used for each type of UART.
  */
@@ -1843,7 +1841,7 @@ void serial8250_tx_chars(struct uart_8250_port *up)
 		if (uart_circ_empty(xmit))
 			break;
 		if ((up->capabilities & UART_CAP_HFIFO) &&
-		    (serial_in(up, UART_LSR) & BOTH_EMPTY) != BOTH_EMPTY)
+		    !uart_lsr_tx_empty(serial_in(up, UART_LSR)))
 			break;
 		/* The BCM2835 MINI UART THRE bit is really a not-full bit. */
 		if ((up->capabilities & UART_CAP_MINI) &&
@@ -2003,7 +2001,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port)
 
 	serial8250_rpm_put(up);
 
-	return (lsr & BOTH_EMPTY) == BOTH_EMPTY ? TIOCSER_TEMT : 0;
+	return uart_lsr_tx_empty(lsr) ? TIOCSER_TEMT : 0;
 }
 
 unsigned int serial8250_do_get_mctrl(struct uart_port *port)
@@ -2151,7 +2149,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
 	else
 		serial_port_out(port, UART_IER, 0);
 
-	wait_for_xmitr(up, BOTH_EMPTY);
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 	/*
 	 *	Send the character out.
 	 */
@@ -2161,7 +2159,7 @@ static void serial8250_put_poll_char(struct uart_port *port,
 	 *	Finally, wait for transmitter to become empty
 	 *	and restore the IER
 	 */
-	wait_for_xmitr(up, BOTH_EMPTY);
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 	serial_port_out(port, UART_IER, ier);
 	serial8250_rpm_put(up);
 }
@@ -3431,7 +3429,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
 	 *	Finally, wait for transmitter to become empty
 	 *	and restore the IER
 	 */
-	wait_for_xmitr(up, BOTH_EMPTY);
+	wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
 
 	if (em485) {
 		mdelay(port->rs485.delay_rts_after_send);
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 98622c35d896..52cb1a68b053 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/console.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
@@ -1102,8 +1103,6 @@ serial_omap_type(struct uart_port *port)
 	return up->name;
 }
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 static void __maybe_unused wait_for_xmitr(struct uart_omap_port *up)
 {
 	unsigned int status, tmout = 10000;
@@ -1118,7 +1117,7 @@ static void __maybe_unused wait_for_xmitr(struct uart_omap_port *up)
 		if (--tmout == 0)
 			break;
 		udelay(1);
-	} while ((status & BOTH_EMPTY) != BOTH_EMPTY);
+	} while (!uart_lsr_tx_empty(status));
 
 	/* Wait up to 1s for flow control if necessary */
 	if (up->port.flags & UPF_CONS_FLOW) {
@@ -1186,7 +1185,7 @@ static void omap_serial_early_putc(struct uart_port *port, unsigned char c)
 
 	for (;;) {
 		status = omap_serial_early_in(port, UART_LSR);
-		if ((status & BOTH_EMPTY) == BOTH_EMPTY)
+		if (uart_lsr_tx_empty(status))
 			break;
 		cpu_relax();
 	}
diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 3b26524d48e3..8a9065e4a903 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c
@@ -3,6 +3,7 @@
  *Copyright (C) 2011 LAPIS Semiconductor Co., Ltd.
  */
 #include <linux/kernel.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -189,8 +190,6 @@ enum {
 #define PCH_UART_HAL_LOOP		(PCH_UART_MCR_LOOP)
 #define PCH_UART_HAL_AFE		(PCH_UART_MCR_AFE)
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 #define DEFAULT_UARTCLK   1843200 /*   1.8432 MHz */
 #define CMITC_UARTCLK   192000000 /* 192.0000 MHz */
 #define FRI2_64_UARTCLK  64000000 /*  64.0000 MHz */
@@ -1516,7 +1515,7 @@ static void pch_uart_put_poll_char(struct uart_port *port,
 	 * Finally, wait for transmitter to become empty
 	 * and restore the IER
 	 */
-	wait_for_xmitr(priv, BOTH_EMPTY);
+	wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY);
 	iowrite8(ier, priv->membase + UART_IER);
 }
 #endif /* CONFIG_CONSOLE_POLL */
@@ -1602,7 +1601,7 @@ pch_console_write(struct console *co, const char *s, unsigned int count)
 	 *	Finally, wait for transmitter to become empty
 	 *	and restore the IER
 	 */
-	wait_for_xmitr(priv, BOTH_EMPTY);
+	wait_for_xmitr(priv, UART_LSR_BOTH_EMPTY);
 	iowrite8(ier, priv->membase + UART_IER);
 
 	if (port_locked)
diff --git a/drivers/tty/serial/pxa.c b/drivers/tty/serial/pxa.c
index e80ba8e10407..9309ffd87c8e 100644
--- a/drivers/tty/serial/pxa.c
+++ b/drivers/tty/serial/pxa.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
+#include <linux/serial.h>
 #include <linux/serial_reg.h>
 #include <linux/circ_buf.h>
 #include <linux/delay.h>
@@ -575,8 +576,6 @@ static struct uart_driver serial_pxa_reg;
 
 #ifdef CONFIG_SERIAL_PXA_CONSOLE
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 /*
  *	Wait for transmitter & holding register to empty
  */
@@ -594,7 +593,7 @@ static void wait_for_xmitr(struct uart_pxa_port *up)
 		if (--tmout == 0)
 			break;
 		udelay(1);
-	} while ((status & BOTH_EMPTY) != BOTH_EMPTY);
+	} while (!uart_lsr_tx_empty(status));
 
 	/* Wait up to 1s for flow control if necessary */
 	if (up->port.flags & UPF_CONS_FLOW) {
diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c
index fff50b5b82eb..84d545e5a8c7 100644
--- a/drivers/tty/serial/sunsu.c
+++ b/drivers/tty/serial/sunsu.c
@@ -1249,8 +1249,6 @@ static int sunsu_kbd_ms_init(struct uart_sunsu_port *up)
 
 #ifdef CONFIG_SERIAL_SUNSU_CONSOLE
 
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
 /*
  *	Wait for transmitter & holding register to empty
  */
@@ -1268,7 +1266,7 @@ static void wait_for_xmitr(struct uart_sunsu_port *up)
 		if (--tmout == 0)
 			break;
 		udelay(1);
-	} while ((status & BOTH_EMPTY) != BOTH_EMPTY);
+	} while (!uart_lsr_tx_empty(status));
 
 	/* Wait up to 1s for flow control if necessary */
 	if (up->port.flags & UPF_CONS_FLOW) {
diff --git a/drivers/tty/serial/vr41xx_siu.c b/drivers/tty/serial/vr41xx_siu.c
index e0bf003ca3a1..1ba689a81abd 100644
--- a/drivers/tty/serial/vr41xx_siu.c
+++ b/drivers/tty/serial/vr41xx_siu.c
@@ -703,8 +703,6 @@ static int siu_init_ports(struct platform_device *pdev)
 
 #ifdef CONFIG_SERIAL_VR41XX_CONSOLE
 
-#define BOTH_EMPTY	(UART_LSR_TEMT | UART_LSR_THRE)
-
 static void wait_for_xmitr(struct uart_port *port)
 {
 	int timeout = 10000;
@@ -715,7 +713,7 @@ static void wait_for_xmitr(struct uart_port *port)
 		if (lsr & UART_LSR_BI)
 			lsr_break_flag[port->line] = UART_LSR_BI;
 
-		if ((lsr & BOTH_EMPTY) == BOTH_EMPTY)
+		if (uart_lsr_tx_empty(lsr))
 			break;
 	} while (timeout-- > 0);
 
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 70a9866e4abb..3d6fe3ef92cf 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -10,10 +10,19 @@
 #define _LINUX_SERIAL_H
 
 #include <uapi/linux/serial.h>
+#include <uapi/linux/serial_reg.h>
 
 /* Helper for dealing with UART_LCR_WLEN* defines */
 #define UART_LCR_WLEN(x)	((x) - 5)
 
+/* FIFO and shifting register empty */
+#define UART_LSR_BOTH_EMPTY	(UART_LSR_TEMT | UART_LSR_THRE)
+
+static inline bool uart_lsr_tx_empty(u16 lsr)
+{
+	return (lsr & UART_LSR_BOTH_EMPTY) == UART_LSR_BOTH_EMPTY;
+}
+
 /*
  * Counters of the input lines (CTS, DSR, RI, CD) interrupts
  */
-- 
cgit v1.2.3


From f8ba5680a56be696b3f4343ed0a591abab807da4 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:05 +0300
Subject: serial: 8250: make saved LSR larger
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DW flags address received as BIT(8) in LSR. In order to not lose that
on read, enlarge lsr_saved_flags to u16.

Adjust lsr/status variables and related call chains to use u16.
Technically, some of these type conversion would not be needed but it
doesn't hurt to be consistent.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h         |  4 ++--
 drivers/tty/serial/8250/8250_exar.c    |  2 +-
 drivers/tty/serial/8250/8250_fsl.c     |  2 +-
 drivers/tty/serial/8250/8250_ingenic.c |  2 +-
 drivers/tty/serial/8250/8250_omap.c    |  7 +++----
 drivers/tty/serial/8250/8250_port.c    | 17 +++++++++--------
 include/linux/serial_8250.h            |  6 +++---
 7 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index b120da57c61f..0ff5688ba90c 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -133,9 +133,9 @@ static inline void serial_out(struct uart_8250_port *up, int offset, int value)
  *
  *	Returns LSR value or'ed with the preserved flags (if any).
  */
-static inline unsigned int serial_lsr_in(struct uart_8250_port *up)
+static inline u16 serial_lsr_in(struct uart_8250_port *up)
 {
-	unsigned int lsr = up->lsr_saved_flags;
+	u16 lsr = up->lsr_saved_flags;
 
 	lsr |= serial_in(up, UART_LSR);
 	up->lsr_saved_flags = lsr & LSR_SAVE_FLAGS;
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 528779b40049..3d999eec4087 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -195,11 +195,11 @@ static int xr17v35x_startup(struct uart_port *port)
 
 static void exar_shutdown(struct uart_port *port)
 {
-	unsigned char lsr;
 	bool tx_complete = false;
 	struct uart_8250_port *up = up_to_u8250p(port);
 	struct circ_buf *xmit = &port->state->xmit;
 	int i = 0;
+	u16 lsr;
 
 	do {
 		lsr = serial_in(up, UART_LSR);
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index 9c01c531349d..fd4005fcd0d6 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -25,8 +25,8 @@
 
 int fsl8250_handle_irq(struct uart_port *port)
 {
-	unsigned char lsr, orig_lsr;
 	unsigned long flags;
+	u16 lsr, orig_lsr;
 	unsigned int iir;
 	struct uart_8250_port *up = up_to_u8250p(port);
 
diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c
index cff91aa03f29..2b2f5d8d24b9 100644
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -54,7 +54,7 @@ static void early_out(struct uart_port *port, int offset, uint8_t value)
 
 static void ingenic_early_console_putc(struct uart_port *port, unsigned char c)
 {
-	uint8_t lsr;
+	u16 lsr;
 
 	do {
 		lsr = early_in(port, UART_LSR);
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index ac8bfa042391..0dcecbbc3967 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -1115,8 +1115,7 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
 	return omap_8250_rx_dma(up);
 }
 
-static unsigned char omap_8250_handle_rx_dma(struct uart_8250_port *up,
-					     u8 iir, unsigned char status)
+static u16 omap_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir, u16 status)
 {
 	if ((status & (UART_LSR_DR | UART_LSR_BI)) &&
 	    (iir & UART_IIR_RDI)) {
@@ -1130,7 +1129,7 @@ static unsigned char omap_8250_handle_rx_dma(struct uart_8250_port *up,
 }
 
 static void am654_8250_handle_rx_dma(struct uart_8250_port *up, u8 iir,
-				     unsigned char status)
+				     u16 status)
 {
 	/*
 	 * Queue a new transfer if FIFO has data.
@@ -1164,7 +1163,7 @@ static int omap_8250_dma_handle_irq(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	struct omap8250_priv *priv = up->port.private_data;
-	unsigned char status;
+	u16 status;
 	u8 iir;
 
 	serial8250_rpm_get(up);
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 55b252954a92..c8ae0e8376d4 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1502,7 +1502,7 @@ static inline void __stop_tx(struct uart_8250_port *p)
 	struct uart_8250_em485 *em485 = p->em485;
 
 	if (em485) {
-		unsigned char lsr = serial_lsr_in(p);
+		u16 lsr = serial_lsr_in(p);
 		u64 stop_delay = 0;
 
 		p->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
@@ -1563,7 +1563,7 @@ static inline void __start_tx(struct uart_port *port)
 
 	if (serial8250_set_THRI(up)) {
 		if (up->bugs & UART_BUG_TXEN) {
-			unsigned char lsr = serial_lsr_in(up);
+			u16 lsr = serial_lsr_in(up);
 
 			if (lsr & UART_LSR_THRE)
 				serial8250_tx_chars(up);
@@ -1716,7 +1716,7 @@ static void serial8250_enable_ms(struct uart_port *port)
 	serial8250_rpm_put(up);
 }
 
-void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr)
+void serial8250_read_char(struct uart_8250_port *up, u16 lsr)
 {
 	struct uart_port *port = &up->port;
 	unsigned char ch;
@@ -1785,7 +1785,7 @@ EXPORT_SYMBOL_GPL(serial8250_read_char);
  * (such as THRE) because the LSR value might come from an already consumed
  * character.
  */
-unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr)
+u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr)
 {
 	struct uart_port *port = &up->port;
 	int max_count = 256;
@@ -1905,10 +1905,10 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
  */
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
 {
-	unsigned char status;
 	struct uart_8250_port *up = up_to_u8250p(port);
 	bool skip_rx = false;
 	unsigned long flags;
+	u16 status;
 
 	if (iir & UART_IIR_NO_INT)
 		return 0;
@@ -1991,7 +1991,7 @@ static unsigned int serial8250_tx_empty(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned long flags;
-	unsigned int lsr;
+	u16 lsr;
 
 	serial8250_rpm_get(up);
 
@@ -2114,8 +2114,8 @@ static void wait_for_xmitr(struct uart_8250_port *up, int bits)
 static int serial8250_get_poll_char(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
-	unsigned char lsr;
 	int status;
+	u16 lsr;
 
 	serial8250_rpm_get(up);
 
@@ -2170,8 +2170,9 @@ int serial8250_do_startup(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned long flags;
-	unsigned char lsr, iir;
+	unsigned char iir;
 	int retval;
+	u16 lsr;
 
 	if (!port->fifosize)
 		port->fifosize = uart_config[port->type].fifo_size;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index ff84a3ed10ea..4565f25ba9a2 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -119,7 +119,7 @@ struct uart_8250_port {
 	 * be immediately processed.
 	 */
 #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS
-	unsigned char		lsr_saved_flags;
+	u16			lsr_saved_flags;
 #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
 	unsigned char		msr_saved_flags;
 
@@ -170,8 +170,8 @@ extern void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
 				      unsigned int quot_frac);
 extern int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
-unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr);
-void serial8250_read_char(struct uart_8250_port *up, unsigned char lsr);
+u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr);
+void serial8250_read_char(struct uart_8250_port *up, u16 lsr);
 void serial8250_tx_chars(struct uart_8250_port *up);
 unsigned int serial8250_modem_status(struct uart_8250_port *up);
 void serial8250_init_port(struct uart_8250_port *up);
-- 
cgit v1.2.3


From 507bd6fbaaefcb8dd89bd00baddf00b439d30c51 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:06 +0300
Subject: serial: 8250: create lsr_save_mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow drivers to alter LSR save mask.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-3-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h      | 2 +-
 drivers/tty/serial/8250/8250_core.c | 4 ++++
 drivers/tty/serial/8250/8250_dw.c   | 2 +-
 include/linux/serial_8250.h         | 1 +
 4 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 0ff5688ba90c..5cc967fe3b59 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -138,7 +138,7 @@ static inline u16 serial_lsr_in(struct uart_8250_port *up)
 	u16 lsr = up->lsr_saved_flags;
 
 	lsr |= serial_in(up, UART_LSR);
-	up->lsr_saved_flags = lsr & LSR_SAVE_FLAGS;
+	up->lsr_saved_flags = lsr & up->lsr_save_mask;
 
 	return lsr;
 }
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 90ddc8924811..57e86133af4f 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1007,6 +1007,7 @@ int serial8250_register_8250_port(const struct uart_8250_port *up)
 		uart->port.rs485	= up->port.rs485;
 		uart->rs485_start_tx	= up->rs485_start_tx;
 		uart->rs485_stop_tx	= up->rs485_stop_tx;
+		uart->lsr_save_mask	= up->lsr_save_mask;
 		uart->dma		= up->dma;
 
 		/* Take tx_loadsz from fifosize if it wasn't set separately */
@@ -1094,6 +1095,9 @@ int serial8250_register_8250_port(const struct uart_8250_port *up)
 			ret = 0;
 		}
 
+		if (!uart->lsr_save_mask)
+			uart->lsr_save_mask = LSR_SAVE_FLAGS;	/* Use default LSR mask */
+
 		/* Initialise interrupt backoff work if required */
 		if (up->overrun_backoff_time_ms > 0) {
 			uart->overrun_backoff_time_ms =
diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c
index 4cc69bb612ab..167a691c7b19 100644
--- a/drivers/tty/serial/8250/8250_dw.c
+++ b/drivers/tty/serial/8250/8250_dw.c
@@ -129,7 +129,7 @@ static void dw8250_tx_wait_empty(struct uart_port *p)
 
 	while (tries--) {
 		lsr = readb (p->membase + (UART_LSR << p->regshift));
-		up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
+		up->lsr_saved_flags |= lsr & up->lsr_save_mask;
 
 		if (lsr & UART_LSR_TEMT)
 			break;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 4565f25ba9a2..8c7b793aa4d7 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -120,6 +120,7 @@ struct uart_8250_port {
 	 */
 #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS
 	u16			lsr_saved_flags;
+	u16			lsr_save_mask;
 #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
 	unsigned char		msr_saved_flags;
 
-- 
cgit v1.2.3


From ae50bb2752836277ae15aa4e9d99074d6d947946 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 24 Jun 2022 23:42:08 +0300
Subject: serial: take termios_rwsem for ->rs485_config() & pass termios as
 param
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be able to alter ADDRB within ->rs485_config(), take termios_rwsem
before calling ->rs485_config() and pass termios.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220624204210.11112-5-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250.h         |  3 ++-
 drivers/tty/serial/8250/8250_dwlib.c   |  3 ++-
 drivers/tty/serial/8250/8250_exar.c    |  9 +++++----
 drivers/tty/serial/8250/8250_fintek.c  |  2 +-
 drivers/tty/serial/8250/8250_lpc18xx.c |  2 +-
 drivers/tty/serial/8250/8250_pci.c     |  2 +-
 drivers/tty/serial/8250/8250_port.c    |  3 ++-
 drivers/tty/serial/amba-pl011.c        |  2 +-
 drivers/tty/serial/ar933x_uart.c       |  2 +-
 drivers/tty/serial/atmel_serial.c      |  2 +-
 drivers/tty/serial/fsl_lpuart.c        |  4 ++--
 drivers/tty/serial/imx.c               |  2 +-
 drivers/tty/serial/max310x.c           |  2 +-
 drivers/tty/serial/mcf.c               |  3 ++-
 drivers/tty/serial/omap-serial.c       |  3 ++-
 drivers/tty/serial/sc16is7xx.c         |  2 +-
 drivers/tty/serial/serial_core.c       | 14 ++++++++++----
 drivers/tty/serial/stm32-usart.c       |  2 +-
 include/linux/serial_core.h            |  1 +
 19 files changed, 38 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 5cc967fe3b59..287153d32536 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -203,7 +203,8 @@ void serial8250_rpm_put(struct uart_8250_port *p);
 void serial8250_rpm_get_tx(struct uart_8250_port *p);
 void serial8250_rpm_put_tx(struct uart_8250_port *p);
 
-int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485);
+int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485);
 void serial8250_em485_start_tx(struct uart_8250_port *p);
 void serial8250_em485_stop_tx(struct uart_8250_port *p);
 void serial8250_em485_destroy(struct uart_8250_port *p);
diff --git a/drivers/tty/serial/8250/8250_dwlib.c b/drivers/tty/serial/8250/8250_dwlib.c
index c83e7eaf3877..d1ff3daeb0ba 100644
--- a/drivers/tty/serial/8250/8250_dwlib.c
+++ b/drivers/tty/serial/8250/8250_dwlib.c
@@ -85,7 +85,8 @@ void dw8250_do_set_termios(struct uart_port *p, struct ktermios *termios, struct
 }
 EXPORT_SYMBOL_GPL(dw8250_do_set_termios);
 
-static int dw8250_rs485_config(struct uart_port *p, struct serial_rs485 *rs485)
+static int dw8250_rs485_config(struct uart_port *p, struct ktermios *termios,
+			       struct serial_rs485 *rs485)
 {
 	u32 tcr;
 
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 3d999eec4087..f5344cfe303c 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -112,7 +112,8 @@
 struct exar8250;
 
 struct exar8250_platform {
-	int (*rs485_config)(struct uart_port *, struct serial_rs485 *);
+	int (*rs485_config)(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485);
 	const struct serial_rs485 *rs485_supported;
 	int (*register_gpio)(struct pci_dev *, struct uart_8250_port *);
 	void (*unregister_gpio)(struct uart_8250_port *);
@@ -409,7 +410,7 @@ static void xr17v35x_unregister_gpio(struct uart_8250_port *port)
 	port->port.private_data = NULL;
 }
 
-static int generic_rs485_config(struct uart_port *port,
+static int generic_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	bool is_rs485 = !!(rs485->flags & SER_RS485_ENABLED);
@@ -441,7 +442,7 @@ static const struct exar8250_platform exar8250_default_platform = {
 	.rs485_supported = &generic_rs485_supported,
 };
 
-static int iot2040_rs485_config(struct uart_port *port,
+static int iot2040_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	bool is_rs485 = !!(rs485->flags & SER_RS485_ENABLED);
@@ -471,7 +472,7 @@ static int iot2040_rs485_config(struct uart_port *port,
 	value |= mode;
 	writeb(value, p + UART_EXAR_MPIOLVL_7_0);
 
-	return generic_rs485_config(port, rs485);
+	return generic_rs485_config(port, termios, rs485);
 }
 
 static const struct serial_rs485 iot2040_rs485_supported = {
diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c
index 1fb86c73786c..eea693f5b577 100644
--- a/drivers/tty/serial/8250/8250_fintek.c
+++ b/drivers/tty/serial/8250/8250_fintek.c
@@ -191,7 +191,7 @@ static int fintek_8250_get_ldn_range(struct fintek_8250 *pdata, int *min,
 	return -ENODEV;
 }
 
-static int fintek_8250_rs485_config(struct uart_port *port,
+static int fintek_8250_rs485_config(struct uart_port *port, struct ktermios *termios,
 			      struct serial_rs485 *rs485)
 {
 	uint8_t config = 0;
diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c
index 3a1cb51cbc91..d7cb3bb52069 100644
--- a/drivers/tty/serial/8250/8250_lpc18xx.c
+++ b/drivers/tty/serial/8250/8250_lpc18xx.c
@@ -32,7 +32,7 @@ struct lpc18xx_uart_data {
 	int line;
 };
 
-static int lpc18xx_rs485_config(struct uart_port *port,
+static int lpc18xx_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index b6d71268aa7d..d31d2350a9db 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1553,7 +1553,7 @@ pci_brcm_trumanage_setup(struct serial_private *priv,
 #define FINTEK_RTS_INVERT		BIT(5)
 
 /* We should do proper H/W transceiver setting before change to RS485 mode */
-static int pci_fintek_rs485_config(struct uart_port *port,
+static int pci_fintek_rs485_config(struct uart_port *port, struct ktermios *termios,
 			       struct serial_rs485 *rs485)
 {
 	struct pci_dev *pci_dev = to_pci_dev(port->dev);
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index c8ae0e8376d4..d4337d8346c8 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -664,7 +664,8 @@ EXPORT_SYMBOL_GPL(serial8250_em485_supported);
  * if the uart is incapable of driving RTS as a Transmit Enable signal in
  * hardware, relying on software emulation instead.
  */
-int serial8250_em485_config(struct uart_port *port, struct serial_rs485 *rs485)
+int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index eccd66625d25..c8f52945a4aa 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2197,7 +2197,7 @@ static int pl011_verify_port(struct uart_port *port, struct serial_struct *ser)
 	return ret;
 }
 
-static int pl011_rs485_config(struct uart_port *port,
+static int pl011_rs485_config(struct uart_port *port, struct ktermios *termios,
 			      struct serial_rs485 *rs485)
 {
 	struct uart_amba_port *uap =
diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c
index ab2c5b2a1ce8..b73ce13683db 100644
--- a/drivers/tty/serial/ar933x_uart.c
+++ b/drivers/tty/serial/ar933x_uart.c
@@ -580,7 +580,7 @@ static const struct uart_ops ar933x_uart_ops = {
 	.verify_port	= ar933x_uart_verify_port,
 };
 
-static int ar933x_config_rs485(struct uart_port *port,
+static int ar933x_config_rs485(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485conf)
 {
 	struct ar933x_uart_port *up =
diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index 3a94c2bdda72..bc6004679585 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -283,7 +283,7 @@ static void atmel_tasklet_schedule(struct atmel_uart_port *atmel_port,
 }
 
 /* Enable or disable the rs485 support */
-static int atmel_config_rs485(struct uart_port *port,
+static int atmel_config_rs485(struct uart_port *port, struct ktermios *termios,
 			      struct serial_rs485 *rs485conf)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index d35414cb3e4e..8fe0494d4057 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -1355,7 +1355,7 @@ static void lpuart_dma_rx_free(struct uart_port *port)
 	sport->dma_rx_cookie = -EINVAL;
 }
 
-static int lpuart_config_rs485(struct uart_port *port,
+static int lpuart_config_rs485(struct uart_port *port, struct ktermios *termios,
 			struct serial_rs485 *rs485)
 {
 	struct lpuart_port *sport = container_of(port,
@@ -1385,7 +1385,7 @@ static int lpuart_config_rs485(struct uart_port *port,
 	return 0;
 }
 
-static int lpuart32_config_rs485(struct uart_port *port,
+static int lpuart32_config_rs485(struct uart_port *port, struct ktermios *termios,
 			struct serial_rs485 *rs485)
 {
 	struct lpuart_port *sport = container_of(port,
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index f4edde54175f..3457006cea3f 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -1907,7 +1907,7 @@ static void imx_uart_poll_put_char(struct uart_port *port, unsigned char c)
 #endif
 
 /* called with port.lock taken and irqs off or from .probe without locking */
-static int imx_uart_rs485_config(struct uart_port *port,
+static int imx_uart_rs485_config(struct uart_port *port, struct ktermios *termios,
 				 struct serial_rs485 *rs485conf)
 {
 	struct imx_port *sport = (struct imx_port *)port;
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 4915a786e315..e162bfb44080 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -1036,7 +1036,7 @@ static void max310x_rs_proc(struct work_struct *ws)
 			MAX310X_MODE2_ECHOSUPR_BIT, mode2);
 }
 
-static int max310x_rs485_config(struct uart_port *port,
+static int max310x_rs485_config(struct uart_port *port, struct ktermios *termios,
 				struct serial_rs485 *rs485)
 {
 	struct max310x_one *one = to_max310x_port(port);
diff --git a/drivers/tty/serial/mcf.c b/drivers/tty/serial/mcf.c
index 036f178e3d66..73c5287b8e5e 100644
--- a/drivers/tty/serial/mcf.c
+++ b/drivers/tty/serial/mcf.c
@@ -431,7 +431,8 @@ static int mcf_verify_port(struct uart_port *port, struct serial_struct *ser)
 /****************************************************************************/
 
 /* Enable or disable the RS485 support */
-static int mcf_config_rs485(struct uart_port *port, struct serial_rs485 *rs485)
+static int mcf_config_rs485(struct uart_port *port, struct ktermios *termios,
+			    struct serial_rs485 *rs485)
 {
 	unsigned char mr1, mr2;
 
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 52cb1a68b053..196bae704f85 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1324,7 +1324,8 @@ static inline void serial_omap_add_console_port(struct uart_omap_port *up)
 
 /* Enable or disable the rs485 support */
 static int
-serial_omap_config_rs485(struct uart_port *port, struct serial_rs485 *rs485)
+serial_omap_config_rs485(struct uart_port *port, struct ktermios *termios,
+			 struct serial_rs485 *rs485)
 {
 	struct uart_omap_port *up = to_uart_omap_port(port);
 	unsigned int mode;
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 2ceecaa4a478..8cb92a3b3fb8 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1127,7 +1127,7 @@ static void sc16is7xx_set_termios(struct uart_port *port,
 	spin_unlock_irqrestore(&port->lock, flags);
 }
 
-static int sc16is7xx_config_rs485(struct uart_port *port,
+static int sc16is7xx_config_rs485(struct uart_port *port, struct ktermios *termios,
 				  struct serial_rs485 *rs485)
 {
 	struct sc16is7xx_port *s = dev_get_drvdata(port->dev);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 75ece750bedc..2529153c8979 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1353,7 +1353,7 @@ int uart_rs485_config(struct uart_port *port)
 
 	uart_sanitize_serial_rs485(port, rs485);
 
-	ret = port->rs485_config(port, rs485);
+	ret = port->rs485_config(port, NULL, rs485);
 	if (ret)
 		memset(rs485, 0, sizeof(*rs485));
 
@@ -1377,7 +1377,7 @@ static int uart_get_rs485_config(struct uart_port *port,
 	return 0;
 }
 
-static int uart_set_rs485_config(struct uart_port *port,
+static int uart_set_rs485_config(struct tty_struct *tty, struct uart_port *port,
 			 struct serial_rs485 __user *rs485_user)
 {
 	struct serial_rs485 rs485;
@@ -1396,7 +1396,7 @@ static int uart_set_rs485_config(struct uart_port *port,
 	uart_sanitize_serial_rs485(port, &rs485);
 
 	spin_lock_irqsave(&port->lock, flags);
-	ret = port->rs485_config(port, &rs485);
+	ret = port->rs485_config(port, &tty->termios, &rs485);
 	if (!ret)
 		port->rs485 = rs485;
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -1505,6 +1505,10 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 	if (ret != -ENOIOCTLCMD)
 		goto out;
 
+	/* rs485_config requires more locking than others */
+	if (cmd == TIOCGRS485)
+		down_write(&tty->termios_rwsem);
+
 	mutex_lock(&port->mutex);
 	uport = uart_port_check(state);
 
@@ -1528,7 +1532,7 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 		break;
 
 	case TIOCSRS485:
-		ret = uart_set_rs485_config(uport, uarg);
+		ret = uart_set_rs485_config(tty, uport, uarg);
 		break;
 
 	case TIOCSISO7816:
@@ -1545,6 +1549,8 @@ uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
 	}
 out_up:
 	mutex_unlock(&port->mutex);
+	if (cmd == TIOCGRS485)
+		up_write(&tty->termios_rwsem);
 out:
 	return ret;
 }
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index db3dd9731ee1..13992e64a7df 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -97,7 +97,7 @@ static void stm32_usart_config_reg_rs485(u32 *cr1, u32 *cr3, u32 delay_ADE,
 	*cr1 |= rs485_deat_dedt;
 }
 
-static int stm32_usart_config_rs485(struct uart_port *port,
+static int stm32_usart_config_rs485(struct uart_port *port, struct ktermios *termios,
 				    struct serial_rs485 *rs485conf)
 {
 	struct stm32_port *stm32_port = to_stm32_port(port);
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index faaf2372c60d..b7b86ee3cb12 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -133,6 +133,7 @@ struct uart_port {
 				      unsigned int old);
 	void			(*handle_break)(struct uart_port *);
 	int			(*rs485_config)(struct uart_port *,
+						struct ktermios *termios,
 						struct serial_rs485 *rs485);
 	int			(*iso7816_config)(struct uart_port *,
 						  struct serial_iso7816 *iso7816);
-- 
cgit v1.2.3


From d1877e639bc6bf1c3131eda3f9ede73f8da96c22 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 8 Jun 2022 12:55:13 -0600
Subject: vfio: de-extern-ify function prototypes

The use of 'extern' in function prototypes has been disrecommended in
the kernel coding style for several years now, remove them from all vfio
related files so contributors no longer need to decide between style and
consistency.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/165471414407.203056.474032786990662279.stgit@omen
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst | 10 ++--
 drivers/s390/cio/vfio_ccw_cp.h                    | 12 ++--
 drivers/s390/cio/vfio_ccw_private.h               |  6 +-
 drivers/vfio/fsl-mc/vfio_fsl_mc_private.h         |  2 +-
 drivers/vfio/platform/vfio_platform_private.h     | 21 ++++---
 include/linux/vfio.h                              | 70 +++++++++++------------
 include/linux/vfio_pci_core.h                     | 65 +++++++++++----------
 7 files changed, 91 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index eded8719180f..1c57815619fd 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -114,11 +114,11 @@ to register and unregister itself with the core driver:
 
 * Register::
 
-    extern int  mdev_register_driver(struct mdev_driver *drv);
+    int mdev_register_driver(struct mdev_driver *drv);
 
 * Unregister::
 
-    extern void mdev_unregister_driver(struct mdev_driver *drv);
+    void mdev_unregister_driver(struct mdev_driver *drv);
 
 The mediated bus driver's probe function should create a vfio_device on top of
 the mdev_device and connect it to an appropriate implementation of
@@ -127,8 +127,8 @@ vfio_device_ops.
 When a driver wants to add the GUID creation sysfs to an existing device it has
 probe'd to then it should call::
 
-	extern int  mdev_register_device(struct device *dev,
-	                                 struct mdev_driver *mdev_driver);
+    int mdev_register_device(struct device *dev,
+                             struct mdev_driver *mdev_driver);
 
 This will provide the 'mdev_supported_types/XX/create' files which can then be
 used to trigger the creation of a mdev_device. The created mdev_device will be
@@ -136,7 +136,7 @@ attached to the specified driver.
 
 When the driver needs to remove itself it calls::
 
-	extern void mdev_unregister_device(struct device *dev);
+    void mdev_unregister_device(struct device *dev);
 
 Which will unbind and destroy all the created mdevs and remove the sysfs files.
 
diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h
index e4c436199b4c..3194d887e08e 100644
--- a/drivers/s390/cio/vfio_ccw_cp.h
+++ b/drivers/s390/cio/vfio_ccw_cp.h
@@ -41,11 +41,11 @@ struct channel_program {
 	struct ccw1 *guest_cp;
 };
 
-extern int cp_init(struct channel_program *cp, union orb *orb);
-extern void cp_free(struct channel_program *cp);
-extern int cp_prefetch(struct channel_program *cp);
-extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm);
-extern void cp_update_scsw(struct channel_program *cp, union scsw *scsw);
-extern bool cp_iova_pinned(struct channel_program *cp, u64 iova);
+int cp_init(struct channel_program *cp, union orb *orb);
+void cp_free(struct channel_program *cp);
+int cp_prefetch(struct channel_program *cp);
+union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm);
+void cp_update_scsw(struct channel_program *cp, union scsw *scsw);
+bool cp_iova_pinned(struct channel_program *cp, u64 iova);
 
 #endif
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index 7272eb788612..b7163bac8cc7 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -119,10 +119,10 @@ struct vfio_ccw_private {
 	struct work_struct	crw_work;
 } __aligned(8);
 
-extern int vfio_ccw_mdev_reg(struct subchannel *sch);
-extern void vfio_ccw_mdev_unreg(struct subchannel *sch);
+int vfio_ccw_mdev_reg(struct subchannel *sch);
+void vfio_ccw_mdev_unreg(struct subchannel *sch);
 
-extern int vfio_ccw_sch_quiesce(struct subchannel *sch);
+int vfio_ccw_sch_quiesce(struct subchannel *sch);
 
 extern struct mdev_driver vfio_ccw_mdev_driver;
 
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
index 4ad63ececb91..7a29f572f93d 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
@@ -39,7 +39,7 @@ struct vfio_fsl_mc_device {
 	struct vfio_fsl_mc_irq      *mc_irqs;
 };
 
-extern int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
+int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
 			       u32 flags, unsigned int index,
 			       unsigned int start, unsigned int count,
 			       void *data);
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
index 520d2a8e8375..691b43f4b2b2 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -78,21 +78,20 @@ struct vfio_platform_reset_node {
 	vfio_platform_reset_fn_t of_reset;
 };
 
-extern int vfio_platform_probe_common(struct vfio_platform_device *vdev,
-				      struct device *dev);
+int vfio_platform_probe_common(struct vfio_platform_device *vdev,
+			       struct device *dev);
 void vfio_platform_remove_common(struct vfio_platform_device *vdev);
 
-extern int vfio_platform_irq_init(struct vfio_platform_device *vdev);
-extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev);
+int vfio_platform_irq_init(struct vfio_platform_device *vdev);
+void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev);
 
-extern int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
-					uint32_t flags, unsigned index,
-					unsigned start, unsigned count,
-					void *data);
+int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
+				 uint32_t flags, unsigned index,
+				 unsigned start, unsigned count, void *data);
 
-extern void __vfio_platform_register_reset(struct vfio_platform_reset_node *n);
-extern void vfio_platform_unregister_reset(const char *compat,
-					   vfio_platform_reset_fn_t fn);
+void __vfio_platform_register_reset(struct vfio_platform_reset_node *n);
+void vfio_platform_unregister_reset(const char *compat,
+				    vfio_platform_reset_fn_t fn);
 #define vfio_platform_register_reset(__compat, __reset)		\
 static struct vfio_platform_reset_node __reset ## _node = {	\
 	.owner = THIS_MODULE,					\
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index aa888cc51757..49580fa2073a 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -140,19 +140,19 @@ int vfio_mig_get_next_state(struct vfio_device *device,
 /*
  * External user API
  */
-extern struct iommu_group *vfio_file_iommu_group(struct file *file);
-extern bool vfio_file_enforced_coherent(struct file *file);
-extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm);
-extern bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
+struct iommu_group *vfio_file_iommu_group(struct file *file);
+bool vfio_file_enforced_coherent(struct file *file);
+void vfio_file_set_kvm(struct file *file, struct kvm *kvm);
+bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 
 #define VFIO_PIN_PAGES_MAX_ENTRIES	(PAGE_SIZE/sizeof(unsigned long))
 
-extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
-			  int npage, int prot, unsigned long *phys_pfn);
-extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-			    int npage);
-extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
-		       void *data, size_t len, bool write);
+int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		   int npage, int prot, unsigned long *phys_pfn);
+int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		     int npage);
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
+		void *data, size_t len, bool write);
 
 /* each type has independent events */
 enum vfio_notify_type {
@@ -162,13 +162,13 @@ enum vfio_notify_type {
 /* events for VFIO_IOMMU_NOTIFY */
 #define VFIO_IOMMU_NOTIFY_DMA_UNMAP	BIT(0)
 
-extern int vfio_register_notifier(struct vfio_device *device,
-				  enum vfio_notify_type type,
-				  unsigned long *required_events,
-				  struct notifier_block *nb);
-extern int vfio_unregister_notifier(struct vfio_device *device,
-				    enum vfio_notify_type type,
-				    struct notifier_block *nb);
+int vfio_register_notifier(struct vfio_device *device,
+			   enum vfio_notify_type type,
+			   unsigned long *required_events,
+			   struct notifier_block *nb);
+int vfio_unregister_notifier(struct vfio_device *device,
+			     enum vfio_notify_type type,
+			     struct notifier_block *nb);
 
 
 /*
@@ -178,25 +178,24 @@ struct vfio_info_cap {
 	struct vfio_info_cap_header *buf;
 	size_t size;
 };
-extern struct vfio_info_cap_header *vfio_info_cap_add(
-		struct vfio_info_cap *caps, size_t size, u16 id, u16 version);
-extern void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
+struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
+					       size_t size, u16 id,
+					       u16 version);
+void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
 
-extern int vfio_info_add_capability(struct vfio_info_cap *caps,
-				    struct vfio_info_cap_header *cap,
-				    size_t size);
+int vfio_info_add_capability(struct vfio_info_cap *caps,
+			     struct vfio_info_cap_header *cap, size_t size);
 
-extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
-					      int num_irqs, int max_irq_type,
-					      size_t *data_size);
+int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
+				       int num_irqs, int max_irq_type,
+				       size_t *data_size);
 
 struct pci_dev;
 #if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
-extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
-extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
-extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
-				       unsigned int cmd,
-				       unsigned long arg);
+void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
+void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
+long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd,
+				unsigned long arg);
 #else
 static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
 {
@@ -230,10 +229,9 @@ struct virqfd {
 	struct virqfd		**pvirqfd;
 };
 
-extern int vfio_virqfd_enable(void *opaque,
-			      int (*handler)(void *, void *),
-			      void (*thread)(void *, void *),
-			      void *data, struct virqfd **pvirqfd, int fd);
-extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
+int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
+		       void (*thread)(void *, void *), void *data,
+		       struct virqfd **pvirqfd, int fd);
+void vfio_virqfd_disable(struct virqfd **pvirqfd);
 
 #endif /* VFIO_H */
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 23c176d4b073..22de2bce6394 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -147,23 +147,23 @@ struct vfio_pci_core_device {
 #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
 #define irq_is(vdev, type) (vdev->irq_type == type)
 
-extern void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
-extern void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
+void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
+void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
 
-extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
-				   uint32_t flags, unsigned index,
-				   unsigned start, unsigned count, void *data);
+int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev,
+			    uint32_t flags, unsigned index,
+			    unsigned start, unsigned count, void *data);
 
-extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
-				  char __user *buf, size_t count,
-				  loff_t *ppos, bool iswrite);
+ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
+			   char __user *buf, size_t count,
+			   loff_t *ppos, bool iswrite);
 
-extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
-			       size_t count, loff_t *ppos, bool iswrite);
+ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite);
 
 #ifdef CONFIG_VFIO_PCI_VGA
-extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
-			       size_t count, loff_t *ppos, bool iswrite);
+ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
+			size_t count, loff_t *ppos, bool iswrite);
 #else
 static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
 				      char __user *buf, size_t count,
@@ -173,32 +173,31 @@ static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
 }
 #endif
 
-extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
-			       uint64_t data, int count, int fd);
+long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
+			uint64_t data, int count, int fd);
 
-extern int vfio_pci_init_perm_bits(void);
-extern void vfio_pci_uninit_perm_bits(void);
+int vfio_pci_init_perm_bits(void);
+void vfio_pci_uninit_perm_bits(void);
 
-extern int vfio_config_init(struct vfio_pci_core_device *vdev);
-extern void vfio_config_free(struct vfio_pci_core_device *vdev);
+int vfio_config_init(struct vfio_pci_core_device *vdev);
+void vfio_config_free(struct vfio_pci_core_device *vdev);
 
-extern int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
-					unsigned int type, unsigned int subtype,
-					const struct vfio_pci_regops *ops,
-					size_t size, u32 flags, void *data);
+int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
+				 unsigned int type, unsigned int subtype,
+				 const struct vfio_pci_regops *ops,
+				 size_t size, u32 flags, void *data);
 
-extern int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
-				    pci_power_t state);
+int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev,
+			     pci_power_t state);
 
-extern bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
-extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device
-						    *vdev);
-extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
-extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
-					       u16 cmd);
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
+void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev);
+u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev);
+void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
+					u16 cmd);
 
 #ifdef CONFIG_VFIO_PCI_IGD
-extern int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
+int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
 #else
 static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 {
@@ -207,8 +206,8 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 #endif
 
 #ifdef CONFIG_S390
-extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
-				       struct vfio_info_cap *caps);
+int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
+				struct vfio_info_cap *caps);
 #else
 static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 					      struct vfio_info_cap *caps)
-- 
cgit v1.2.3


From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Mon, 27 Jun 2022 09:00:26 +0300
Subject: docs: rename Documentation/vm to Documentation/mm

so it will be consistent with code mm directory and with
Documentation/admin-guide/mm and won't be confused with virtual machines.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Tested-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Wu XiangCheng <bobwxc@email.cn>
---
 Documentation/ABI/testing/sysfs-kernel-mm-ksm      |   2 +-
 Documentation/ABI/testing/sysfs-kernel-slab        |   4 +-
 Documentation/admin-guide/kernel-parameters.txt    |  10 +-
 Documentation/admin-guide/mm/concepts.rst          |   2 +-
 Documentation/admin-guide/mm/damon/index.rst       |   2 +-
 Documentation/admin-guide/mm/damon/reclaim.rst     |   2 +-
 Documentation/admin-guide/mm/damon/usage.rst       |   8 +-
 Documentation/admin-guide/sysctl/vm.rst            |   2 +-
 Documentation/core-api/index.rst                   |   2 +-
 Documentation/filesystems/proc.rst                 |   2 +-
 Documentation/index.rst                            |   2 +-
 Documentation/mm/active_mm.rst                     |  91 ++++
 Documentation/mm/arch_pgtable_helpers.rst          | 260 +++++++++
 Documentation/mm/balance.rst                       | 102 ++++
 Documentation/mm/bootmem.rst                       |   5 +
 Documentation/mm/damon/api.rst                     |  20 +
 Documentation/mm/damon/design.rst                  | 176 ++++++
 Documentation/mm/damon/faq.rst                     |  50 ++
 Documentation/mm/damon/index.rst                   |  29 +
 Documentation/mm/free_page_reporting.rst           |  40 ++
 Documentation/mm/frontswap.rst                     | 266 +++++++++
 Documentation/mm/highmem.rst                       | 167 ++++++
 Documentation/mm/hmm.rst                           | 452 ++++++++++++++++
 Documentation/mm/hugetlbfs_reserv.rst              | 596 +++++++++++++++++++++
 Documentation/mm/hwpoison.rst                      | 184 +++++++
 Documentation/mm/index.rst                         |  68 +++
 Documentation/mm/ksm.rst                           |  87 +++
 Documentation/mm/memory-model.rst                  | 177 ++++++
 Documentation/mm/mmu_notifier.rst                  |  99 ++++
 Documentation/mm/numa.rst                          | 150 ++++++
 Documentation/mm/oom.rst                           |   5 +
 Documentation/mm/overcommit-accounting.rst         |  88 +++
 Documentation/mm/page_allocation.rst               |   5 +
 Documentation/mm/page_cache.rst                    |   5 +
 Documentation/mm/page_frags.rst                    |  45 ++
 Documentation/mm/page_migration.rst                | 288 ++++++++++
 Documentation/mm/page_owner.rst                    | 196 +++++++
 Documentation/mm/page_reclaim.rst                  |   5 +
 Documentation/mm/page_table_check.rst              |  56 ++
 Documentation/mm/page_tables.rst                   |   5 +
 Documentation/mm/physical_memory.rst               |   5 +
 Documentation/mm/process_addrs.rst                 |   5 +
 Documentation/mm/remap_file_pages.rst              |  33 ++
 Documentation/mm/shmfs.rst                         |   5 +
 Documentation/mm/slab.rst                          |   5 +
 Documentation/mm/slub.rst                          | 452 ++++++++++++++++
 Documentation/mm/split_page_table_lock.rst         | 100 ++++
 Documentation/mm/swap.rst                          |   5 +
 Documentation/mm/transhuge.rst                     | 187 +++++++
 Documentation/mm/unevictable-lru.rst               | 554 +++++++++++++++++++
 Documentation/mm/vmalloc.rst                       |   5 +
 Documentation/mm/vmalloced-kernel-stacks.rst       | 153 ++++++
 Documentation/mm/vmemmap_dedup.rst                 | 223 ++++++++
 Documentation/mm/z3fold.rst                        |  30 ++
 Documentation/mm/zsmalloc.rst                      |  82 +++
 .../zh_CN/admin-guide/mm/damon/index.rst           |   2 +-
 .../zh_CN/admin-guide/mm/damon/reclaim.rst         |   2 +-
 .../zh_CN/admin-guide/mm/damon/usage.rst           |   8 +-
 .../translations/zh_CN/core-api/index.rst          |   2 +-
 Documentation/translations/zh_CN/index.rst         |   2 +-
 Documentation/translations/zh_CN/mm/active_mm.rst  |  85 +++
 Documentation/translations/zh_CN/mm/balance.rst    |  81 +++
 Documentation/translations/zh_CN/mm/damon/api.rst  |  32 ++
 .../translations/zh_CN/mm/damon/design.rst         | 140 +++++
 Documentation/translations/zh_CN/mm/damon/faq.rst  |  48 ++
 .../translations/zh_CN/mm/damon/index.rst          |  32 ++
 .../translations/zh_CN/mm/free_page_reporting.rst  |  38 ++
 Documentation/translations/zh_CN/mm/frontswap.rst  | 196 +++++++
 Documentation/translations/zh_CN/mm/highmem.rst    | 128 +++++
 Documentation/translations/zh_CN/mm/hmm.rst        | 361 +++++++++++++
 .../translations/zh_CN/mm/hugetlbfs_reserv.rst     | 436 +++++++++++++++
 Documentation/translations/zh_CN/mm/hwpoison.rst   | 166 ++++++
 Documentation/translations/zh_CN/mm/index.rst      |  54 ++
 Documentation/translations/zh_CN/mm/ksm.rst        |  70 +++
 .../translations/zh_CN/mm/memory-model.rst         | 135 +++++
 .../translations/zh_CN/mm/mmu_notifier.rst         |  97 ++++
 Documentation/translations/zh_CN/mm/numa.rst       | 101 ++++
 .../zh_CN/mm/overcommit-accounting.rst             |  86 +++
 Documentation/translations/zh_CN/mm/page_frags.rst |  38 ++
 Documentation/translations/zh_CN/mm/page_owner.rst | 116 ++++
 .../translations/zh_CN/mm/page_table_check.rst     |  56 ++
 .../translations/zh_CN/mm/remap_file_pages.rst     |  32 ++
 .../zh_CN/mm/split_page_table_lock.rst             |  96 ++++
 Documentation/translations/zh_CN/mm/z3fold.rst     |  31 ++
 Documentation/translations/zh_CN/mm/zsmalloc.rst   |  78 +++
 Documentation/translations/zh_CN/vm/active_mm.rst  |  85 ---
 Documentation/translations/zh_CN/vm/balance.rst    |  81 ---
 Documentation/translations/zh_CN/vm/damon/api.rst  |  32 --
 .../translations/zh_CN/vm/damon/design.rst         | 140 -----
 Documentation/translations/zh_CN/vm/damon/faq.rst  |  48 --
 .../translations/zh_CN/vm/damon/index.rst          |  33 --
 .../translations/zh_CN/vm/free_page_reporting.rst  |  38 --
 Documentation/translations/zh_CN/vm/frontswap.rst  | 196 -------
 Documentation/translations/zh_CN/vm/highmem.rst    | 128 -----
 Documentation/translations/zh_CN/vm/hmm.rst        | 361 -------------
 .../translations/zh_CN/vm/hugetlbfs_reserv.rst     | 436 ---------------
 Documentation/translations/zh_CN/vm/hwpoison.rst   | 166 ------
 Documentation/translations/zh_CN/vm/index.rst      |  54 --
 Documentation/translations/zh_CN/vm/ksm.rst        |  70 ---
 .../translations/zh_CN/vm/memory-model.rst         | 135 -----
 .../translations/zh_CN/vm/mmu_notifier.rst         |  97 ----
 Documentation/translations/zh_CN/vm/numa.rst       | 101 ----
 .../zh_CN/vm/overcommit-accounting.rst             |  86 ---
 Documentation/translations/zh_CN/vm/page_frags.rst |  38 --
 Documentation/translations/zh_CN/vm/page_owner.rst | 116 ----
 .../translations/zh_CN/vm/page_table_check.rst     |  56 --
 .../translations/zh_CN/vm/remap_file_pages.rst     |  32 --
 .../zh_CN/vm/split_page_table_lock.rst             |  96 ----
 Documentation/translations/zh_CN/vm/z3fold.rst     |  31 --
 Documentation/translations/zh_CN/vm/zsmalloc.rst   |  78 ---
 Documentation/translations/zh_TW/index.rst         |   2 +-
 Documentation/vm/.gitignore                        |   3 -
 Documentation/vm/active_mm.rst                     |  91 ----
 Documentation/vm/arch_pgtable_helpers.rst          | 260 ---------
 Documentation/vm/balance.rst                       | 102 ----
 Documentation/vm/bootmem.rst                       |   5 -
 Documentation/vm/damon/api.rst                     |  20 -
 Documentation/vm/damon/design.rst                  | 176 ------
 Documentation/vm/damon/faq.rst                     |  50 --
 Documentation/vm/damon/index.rst                   |  29 -
 Documentation/vm/free_page_reporting.rst           |  40 --
 Documentation/vm/frontswap.rst                     | 266 ---------
 Documentation/vm/highmem.rst                       | 167 ------
 Documentation/vm/hmm.rst                           | 452 ----------------
 Documentation/vm/hugetlbfs_reserv.rst              | 596 ---------------------
 Documentation/vm/hwpoison.rst                      | 184 -------
 Documentation/vm/index.rst                         |  68 ---
 Documentation/vm/ksm.rst                           |  87 ---
 Documentation/vm/memory-model.rst                  | 177 ------
 Documentation/vm/mmu_notifier.rst                  |  99 ----
 Documentation/vm/numa.rst                          | 150 ------
 Documentation/vm/oom.rst                           |   5 -
 Documentation/vm/overcommit-accounting.rst         |  88 ---
 Documentation/vm/page_allocation.rst               |   5 -
 Documentation/vm/page_cache.rst                    |   5 -
 Documentation/vm/page_frags.rst                    |  45 --
 Documentation/vm/page_migration.rst                | 288 ----------
 Documentation/vm/page_owner.rst                    | 196 -------
 Documentation/vm/page_reclaim.rst                  |   5 -
 Documentation/vm/page_table_check.rst              |  56 --
 Documentation/vm/page_tables.rst                   |   5 -
 Documentation/vm/physical_memory.rst               |   5 -
 Documentation/vm/process_addrs.rst                 |   5 -
 Documentation/vm/remap_file_pages.rst              |  33 --
 Documentation/vm/shmfs.rst                         |   5 -
 Documentation/vm/slab.rst                          |   5 -
 Documentation/vm/slub.rst                          | 452 ----------------
 Documentation/vm/split_page_table_lock.rst         | 100 ----
 Documentation/vm/swap.rst                          |   5 -
 Documentation/vm/transhuge.rst                     | 187 -------
 Documentation/vm/unevictable-lru.rst               | 554 -------------------
 Documentation/vm/vmalloc.rst                       |   5 -
 Documentation/vm/vmalloced-kernel-stacks.rst       | 153 ------
 Documentation/vm/vmemmap_dedup.rst                 | 223 --------
 Documentation/vm/z3fold.rst                        |  30 --
 Documentation/vm/zsmalloc.rst                      |  82 ---
 MAINTAINERS                                        |  12 +-
 arch/loongarch/Kconfig                             |   2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h       |   2 +-
 include/linux/hmm.h                                |   4 +-
 include/linux/memremap.h                           |   2 +-
 include/linux/mmu_notifier.h                       |   2 +-
 include/linux/sched/mm.h                           |   4 +-
 include/linux/swap.h                               |   2 +-
 mm/Kconfig                                         |   2 +-
 mm/debug_vm_pgtable.c                              |   2 +-
 mm/frontswap.c                                     |   2 +-
 mm/huge_memory.c                                   |   2 +-
 mm/hugetlb.c                                       |   6 +-
 mm/hugetlb_vmemmap.c                               |   2 +-
 mm/ksm.c                                           |   4 +-
 mm/mmap.c                                          |   2 +-
 mm/rmap.c                                          |   8 +-
 mm/sparse-vmemmap.c                                |   2 +-
 mm/util.c                                          |   2 +-
 tools/vm/page_owner_sort.c                         |   2 +-
 176 files changed, 8355 insertions(+), 8359 deletions(-)
 create mode 100644 Documentation/mm/active_mm.rst
 create mode 100644 Documentation/mm/arch_pgtable_helpers.rst
 create mode 100644 Documentation/mm/balance.rst
 create mode 100644 Documentation/mm/bootmem.rst
 create mode 100644 Documentation/mm/damon/api.rst
 create mode 100644 Documentation/mm/damon/design.rst
 create mode 100644 Documentation/mm/damon/faq.rst
 create mode 100644 Documentation/mm/damon/index.rst
 create mode 100644 Documentation/mm/free_page_reporting.rst
 create mode 100644 Documentation/mm/frontswap.rst
 create mode 100644 Documentation/mm/highmem.rst
 create mode 100644 Documentation/mm/hmm.rst
 create mode 100644 Documentation/mm/hugetlbfs_reserv.rst
 create mode 100644 Documentation/mm/hwpoison.rst
 create mode 100644 Documentation/mm/index.rst
 create mode 100644 Documentation/mm/ksm.rst
 create mode 100644 Documentation/mm/memory-model.rst
 create mode 100644 Documentation/mm/mmu_notifier.rst
 create mode 100644 Documentation/mm/numa.rst
 create mode 100644 Documentation/mm/oom.rst
 create mode 100644 Documentation/mm/overcommit-accounting.rst
 create mode 100644 Documentation/mm/page_allocation.rst
 create mode 100644 Documentation/mm/page_cache.rst
 create mode 100644 Documentation/mm/page_frags.rst
 create mode 100644 Documentation/mm/page_migration.rst
 create mode 100644 Documentation/mm/page_owner.rst
 create mode 100644 Documentation/mm/page_reclaim.rst
 create mode 100644 Documentation/mm/page_table_check.rst
 create mode 100644 Documentation/mm/page_tables.rst
 create mode 100644 Documentation/mm/physical_memory.rst
 create mode 100644 Documentation/mm/process_addrs.rst
 create mode 100644 Documentation/mm/remap_file_pages.rst
 create mode 100644 Documentation/mm/shmfs.rst
 create mode 100644 Documentation/mm/slab.rst
 create mode 100644 Documentation/mm/slub.rst
 create mode 100644 Documentation/mm/split_page_table_lock.rst
 create mode 100644 Documentation/mm/swap.rst
 create mode 100644 Documentation/mm/transhuge.rst
 create mode 100644 Documentation/mm/unevictable-lru.rst
 create mode 100644 Documentation/mm/vmalloc.rst
 create mode 100644 Documentation/mm/vmalloced-kernel-stacks.rst
 create mode 100644 Documentation/mm/vmemmap_dedup.rst
 create mode 100644 Documentation/mm/z3fold.rst
 create mode 100644 Documentation/mm/zsmalloc.rst
 create mode 100644 Documentation/translations/zh_CN/mm/active_mm.rst
 create mode 100644 Documentation/translations/zh_CN/mm/balance.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/api.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/design.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/faq.rst
 create mode 100644 Documentation/translations/zh_CN/mm/damon/index.rst
 create mode 100644 Documentation/translations/zh_CN/mm/free_page_reporting.rst
 create mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst
 create mode 100644 Documentation/translations/zh_CN/mm/highmem.rst
 create mode 100644 Documentation/translations/zh_CN/mm/hmm.rst
 create mode 100644 Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst
 create mode 100644 Documentation/translations/zh_CN/mm/hwpoison.rst
 create mode 100644 Documentation/translations/zh_CN/mm/index.rst
 create mode 100644 Documentation/translations/zh_CN/mm/ksm.rst
 create mode 100644 Documentation/translations/zh_CN/mm/memory-model.rst
 create mode 100644 Documentation/translations/zh_CN/mm/mmu_notifier.rst
 create mode 100644 Documentation/translations/zh_CN/mm/numa.rst
 create mode 100644 Documentation/translations/zh_CN/mm/overcommit-accounting.rst
 create mode 100644 Documentation/translations/zh_CN/mm/page_frags.rst
 create mode 100644 Documentation/translations/zh_CN/mm/page_owner.rst
 create mode 100644 Documentation/translations/zh_CN/mm/page_table_check.rst
 create mode 100644 Documentation/translations/zh_CN/mm/remap_file_pages.rst
 create mode 100644 Documentation/translations/zh_CN/mm/split_page_table_lock.rst
 create mode 100644 Documentation/translations/zh_CN/mm/z3fold.rst
 create mode 100644 Documentation/translations/zh_CN/mm/zsmalloc.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/active_mm.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/balance.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/api.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/design.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/faq.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/damon/index.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/free_page_reporting.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/frontswap.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/highmem.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/hmm.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/hwpoison.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/index.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/ksm.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/memory-model.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/mmu_notifier.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/numa.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/overcommit-accounting.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/page_frags.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/page_owner.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/page_table_check.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/remap_file_pages.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/split_page_table_lock.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/z3fold.rst
 delete mode 100644 Documentation/translations/zh_CN/vm/zsmalloc.rst
 delete mode 100644 Documentation/vm/.gitignore
 delete mode 100644 Documentation/vm/active_mm.rst
 delete mode 100644 Documentation/vm/arch_pgtable_helpers.rst
 delete mode 100644 Documentation/vm/balance.rst
 delete mode 100644 Documentation/vm/bootmem.rst
 delete mode 100644 Documentation/vm/damon/api.rst
 delete mode 100644 Documentation/vm/damon/design.rst
 delete mode 100644 Documentation/vm/damon/faq.rst
 delete mode 100644 Documentation/vm/damon/index.rst
 delete mode 100644 Documentation/vm/free_page_reporting.rst
 delete mode 100644 Documentation/vm/frontswap.rst
 delete mode 100644 Documentation/vm/highmem.rst
 delete mode 100644 Documentation/vm/hmm.rst
 delete mode 100644 Documentation/vm/hugetlbfs_reserv.rst
 delete mode 100644 Documentation/vm/hwpoison.rst
 delete mode 100644 Documentation/vm/index.rst
 delete mode 100644 Documentation/vm/ksm.rst
 delete mode 100644 Documentation/vm/memory-model.rst
 delete mode 100644 Documentation/vm/mmu_notifier.rst
 delete mode 100644 Documentation/vm/numa.rst
 delete mode 100644 Documentation/vm/oom.rst
 delete mode 100644 Documentation/vm/overcommit-accounting.rst
 delete mode 100644 Documentation/vm/page_allocation.rst
 delete mode 100644 Documentation/vm/page_cache.rst
 delete mode 100644 Documentation/vm/page_frags.rst
 delete mode 100644 Documentation/vm/page_migration.rst
 delete mode 100644 Documentation/vm/page_owner.rst
 delete mode 100644 Documentation/vm/page_reclaim.rst
 delete mode 100644 Documentation/vm/page_table_check.rst
 delete mode 100644 Documentation/vm/page_tables.rst
 delete mode 100644 Documentation/vm/physical_memory.rst
 delete mode 100644 Documentation/vm/process_addrs.rst
 delete mode 100644 Documentation/vm/remap_file_pages.rst
 delete mode 100644 Documentation/vm/shmfs.rst
 delete mode 100644 Documentation/vm/slab.rst
 delete mode 100644 Documentation/vm/slub.rst
 delete mode 100644 Documentation/vm/split_page_table_lock.rst
 delete mode 100644 Documentation/vm/swap.rst
 delete mode 100644 Documentation/vm/transhuge.rst
 delete mode 100644 Documentation/vm/unevictable-lru.rst
 delete mode 100644 Documentation/vm/vmalloc.rst
 delete mode 100644 Documentation/vm/vmalloced-kernel-stacks.rst
 delete mode 100644 Documentation/vm/vmemmap_dedup.rst
 delete mode 100644 Documentation/vm/z3fold.rst
 delete mode 100644 Documentation/vm/zsmalloc.rst

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
index 1c9bed5595f5..d244674a9480 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -41,7 +41,7 @@ Description:	Kernel Samepage Merging daemon sysfs interface
 		sleep_millisecs: how many milliseconds ksm should sleep between
 		scans.
 
-		See Documentation/vm/ksm.rst for more information.
+		See Documentation/mm/ksm.rst for more information.
 
 What:		/sys/kernel/mm/ksm/merge_across_nodes
 Date:		January 2013
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index c440f4946e12..cd5fb8fa3ddf 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -37,7 +37,7 @@ Description:
 		The alloc_calls file is read-only and lists the kernel code
 		locations from which allocations for this cache were performed.
 		The alloc_calls file only contains information if debugging is
-		enabled for that cache (see Documentation/vm/slub.rst).
+		enabled for that cache (see Documentation/mm/slub.rst).
 
 What:		/sys/kernel/slab/<cache>/alloc_fastpath
 Date:		February 2008
@@ -219,7 +219,7 @@ Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 Description:
 		The free_calls file is read-only and lists the locations of
 		object frees if slab debugging is enabled (see
-		Documentation/vm/slub.rst).
+		Documentation/mm/slub.rst).
 
 What:		/sys/kernel/slab/<cache>/free_fastpath
 Date:		February 2008
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f..8c0ea6b6c6a9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5442,7 +5442,7 @@
 			cache (risks via metadata attacks are mostly
 			unchanged). Debug options disable merging on their
 			own.
-			For more information see Documentation/vm/slub.rst.
+			For more information see Documentation/mm/slub.rst.
 
 	slab_max_order=	[MM, SLAB]
 			Determines the maximum allowed order for slabs.
@@ -5456,13 +5456,13 @@
 			slub_debug can create guard zones around objects and
 			may poison objects when not in use. Also tracks the
 			last alloc / free. For more information see
-			Documentation/vm/slub.rst.
+			Documentation/mm/slub.rst.
 
 	slub_max_order= [MM, SLUB]
 			Determines the maximum allowed order for slabs.
 			A high setting may cause OOMs due to memory
 			fragmentation. For more information see
-			Documentation/vm/slub.rst.
+			Documentation/mm/slub.rst.
 
 	slub_min_objects=	[MM, SLUB]
 			The minimum number of objects per slab. SLUB will
@@ -5471,12 +5471,12 @@
 			the number of objects indicated. The higher the number
 			of objects the smaller the overhead of tracking slabs
 			and the less frequently locks need to be acquired.
-			For more information see Documentation/vm/slub.rst.
+			For more information see Documentation/mm/slub.rst.
 
 	slub_min_order=	[MM, SLUB]
 			Determines the minimum page order for slabs. Must be
 			lower than slub_max_order.
-			For more information see Documentation/vm/slub.rst.
+			For more information see Documentation/mm/slub.rst.
 
 	slub_merge	[MM, SLUB]
 			Same with slab_merge.
diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst
index b966fcff993b..c79f1e336222 100644
--- a/Documentation/admin-guide/mm/concepts.rst
+++ b/Documentation/admin-guide/mm/concepts.rst
@@ -125,7 +125,7 @@ processor. Each bank is referred to as a `node` and for each node Linux
 constructs an independent memory management subsystem. A node has its
 own set of zones, lists of free and used pages and various statistics
 counters. You can find more details about NUMA in
-:ref:`Documentation/vm/numa.rst <numa>` and in
+:ref:`Documentation/mm/numa.rst <numa>` and in
 :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
 
 Page cache
diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst
index 61aff88347f3..c4681fa69b9c 100644
--- a/Documentation/admin-guide/mm/damon/index.rst
+++ b/Documentation/admin-guide/mm/damon/index.rst
@@ -4,7 +4,7 @@
 Monitoring Data Accesses
 ========================
 
-:doc:`DAMON </vm/damon/index>` allows light-weight data access monitoring.
+:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
 Using DAMON, users can analyze the memory access patterns of their systems and
 optimize those.
 
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
index 46306f1f34b1..a8bd3bd29959 100644
--- a/Documentation/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -268,4 +268,4 @@ granularity reclamation. ::
 
 .. [1] https://research.google/pubs/pub48551/
 .. [2] https://lwn.net/Articles/787611/
-.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
+.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 1bb7b72414b2..5540a3a40fc9 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -30,11 +30,11 @@ DAMON provides below interfaces for different users.
   <sysfs_interface>`.  This will be removed after next LTS kernel is released,
   so users should move to the :ref:`sysfs interface <sysfs_interface>`.
 - *Kernel Space Programming Interface.*
-  :doc:`This </vm/damon/api>` is for kernel space programmers.  Using this,
+  :doc:`This </mm/damon/api>` is for kernel space programmers.  Using this,
   users can utilize every feature of DAMON most flexibly and efficiently by
   writing kernel space DAMON application programs for you.  You can even extend
   DAMON for various address spaces.  For detail, please refer to the interface
-  :doc:`document </vm/damon/api>`.
+  :doc:`document </mm/damon/api>`.
 
 .. _sysfs_interface:
 
@@ -185,7 +185,7 @@ controls the monitoring overhead, exist.  You can set and get the values by
 writing to and rading from the files.
 
 For more details about the intervals and monitoring regions range, please refer
-to the Design document (:doc:`/vm/damon/design`).
+to the Design document (:doc:`/mm/damon/design`).
 
 contexts/<N>/targets/
 ---------------------
@@ -402,7 +402,7 @@ Attributes
 Users can get and set the ``sampling interval``, ``aggregation interval``,
 ``update interval``, and min/max number of monitoring target regions by
 reading from and writing to the ``attrs`` file.  To know about the monitoring
-attributes in detail, please refer to the :doc:`/vm/damon/design`.  For
+attributes in detail, please refer to the :doc:`/mm/damon/design`.  For
 example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
 1000, and then check it again::
 
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 5c9aa171a0d3..4a440a7cfeb0 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -760,7 +760,7 @@ and don't use much of it.
 
 The default value is 0.
 
-See Documentation/vm/overcommit-accounting.rst and
+See Documentation/mm/overcommit-accounting.rst and
 mm/util.c::__vm_enough_memory() for more information.
 
 
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index dedd4d853329..5b1188494bcd 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -87,7 +87,7 @@ Memory management
 =================
 
 How to allocate and use memory in the kernel.  Note that there is a lot
-more memory-management documentation in Documentation/vm/index.rst.
+more memory-management documentation in Documentation/mm/index.rst.
 
 .. toctree::
    :maxdepth: 1
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 1bc91fb8c321..8543a59f288f 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1109,7 +1109,7 @@ CommitLimit
               yield a CommitLimit of 7.3G.
 
               For more details, see the memory overcommit documentation
-              in vm/overcommit-accounting.
+              in mm/overcommit-accounting.
 Committed_AS
               The amount of memory presently allocated on the system.
               The committed memory is a sum of all of the memory which
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 67036a05b771..4737c18c97ff 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -128,7 +128,7 @@ needed).
    sound/index
    crypto/index
    filesystems/index
-   vm/index
+   mm/index
    bpf/index
    usb/index
    PCI/index
diff --git a/Documentation/mm/active_mm.rst b/Documentation/mm/active_mm.rst
new file mode 100644
index 000000000000..6f8269c284ed
--- /dev/null
+++ b/Documentation/mm/active_mm.rst
@@ -0,0 +1,91 @@
+.. _active_mm:
+
+=========
+Active MM
+=========
+
+::
+
+ List:       linux-kernel
+ Subject:    Re: active_mm
+ From:       Linus Torvalds <torvalds () transmeta ! com>
+ Date:       1999-07-30 21:36:24
+
+ Cc'd to linux-kernel, because I don't write explanations all that often,
+ and when I do I feel better about more people reading them.
+
+ On Fri, 30 Jul 1999, David Mosberger wrote:
+ >
+ > Is there a brief description someplace on how "mm" vs. "active_mm" in
+ > the task_struct are supposed to be used?  (My apologies if this was
+ > discussed on the mailing lists---I just returned from vacation and
+ > wasn't able to follow linux-kernel for a while).
+
+ Basically, the new setup is:
+
+  - we have "real address spaces" and "anonymous address spaces". The
+    difference is that an anonymous address space doesn't care about the
+    user-level page tables at all, so when we do a context switch into an
+    anonymous address space we just leave the previous address space
+    active.
+
+    The obvious use for a "anonymous address space" is any thread that
+    doesn't need any user mappings - all kernel threads basically fall into
+    this category, but even "real" threads can temporarily say that for
+    some amount of time they are not going to be interested in user space,
+    and that the scheduler might as well try to avoid wasting time on
+    switching the VM state around. Currently only the old-style bdflush
+    sync does that.
+
+  - "tsk->mm" points to the "real address space". For an anonymous process,
+    tsk->mm will be NULL, for the logical reason that an anonymous process
+    really doesn't _have_ a real address space at all.
+
+  - however, we obviously need to keep track of which address space we
+    "stole" for such an anonymous user. For that, we have "tsk->active_mm",
+    which shows what the currently active address space is.
+
+    The rule is that for a process with a real address space (ie tsk->mm is
+    non-NULL) the active_mm obviously always has to be the same as the real
+    one.
+
+    For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
+    "borrowed" mm while the anonymous process is running. When the
+    anonymous process gets scheduled away, the borrowed address space is
+    returned and cleared.
+
+ To support all that, the "struct mm_struct" now has two counters: a
+ "mm_users" counter that is how many "real address space users" there are,
+ and a "mm_count" counter that is the number of "lazy" users (ie anonymous
+ users) plus one if there are any real users.
+
+ Usually there is at least one real user, but it could be that the real
+ user exited on another CPU while a lazy user was still active, so you do
+ actually get cases where you have a address space that is _only_ used by
+ lazy users. That is often a short-lived state, because once that thread
+ gets scheduled away in favour of a real thread, the "zombie" mm gets
+ released because "mm_count" becomes zero.
+
+ Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
+ more. "init_mm" should be considered just a "lazy context when no other
+ context is available", and in fact it is mainly used just at bootup when
+ no real VM has yet been created. So code that used to check
+
+ 	if (current->mm == &init_mm)
+
+ should generally just do
+
+ 	if (!current->mm)
+
+ instead (which makes more sense anyway - the test is basically one of "do
+ we have a user context", and is generally done by the page fault handler
+ and things like that).
+
+ Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
+ because it slightly changes the interfaces to accommodate the alpha (who
+ would have thought it, but the alpha actually ends up having one of the
+ ugliest context switch codes - unlike the other architectures where the MM
+ and register state is separate, the alpha PALcode joins the two, and you
+ need to switch both together).
+
+ (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
new file mode 100644
index 000000000000..cbaee9e59241
--- /dev/null
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -0,0 +1,260 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _arch_page_table_helpers:
+
+===============================
+Architecture Page Table Helpers
+===============================
+
+Generic MM expects architectures (with MMU) to provide helpers to create, access
+and modify page table entries at various level for different memory functions.
+These page table helpers need to conform to a common semantics across platforms.
+Following tables describe the expected semantics which can also be tested during
+boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug
+test need to be in sync.
+
+
+PTE Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pte_same                  | Tests whether both PTE entries are the same      |
++---------------------------+--------------------------------------------------+
+| pte_bad                   | Tests a non-table mapped PTE                     |
++---------------------------+--------------------------------------------------+
+| pte_present               | Tests a valid mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_young                 | Tests a young PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_dirty                 | Tests a dirty PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_write                 | Tests a writable PTE                             |
++---------------------------+--------------------------------------------------+
+| pte_special               | Tests a special PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_protnone              | Tests a PROT_NONE PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_soft_dirty            | Tests a soft dirty PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_mkyoung               | Creates a young PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkold                 | Creates an old PTE                               |
++---------------------------+--------------------------------------------------+
+| pte_mkdirty               | Creates a dirty PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkclean               | Creates a clean PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkwrite               | Creates a writable PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_wrprotect             | Creates a write protected PTE                    |
++---------------------------+--------------------------------------------------+
+| pte_mkspecial             | Creates a special PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_mkdevmap              | Creates a ZONE_DEVICE mapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_mksoft_dirty          | Creates a soft dirty PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_clear_soft_dirty      | Clears a soft dirty PTE                          |
++---------------------------+--------------------------------------------------+
+| pte_swp_mksoft_dirty      | Creates a soft dirty swapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_swp_clear_soft_dirty  | Clears a soft dirty swapped PTE                  |
++---------------------------+--------------------------------------------------+
+| pte_mknotpresent          | Invalidates a mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| ptep_clear                | Clears a PTE                                     |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear        | Clears and returns PTE                           |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear_full   | Clears and returns PTE (batched PTE unmap)       |
++---------------------------+--------------------------------------------------+
+| ptep_test_and_clear_young | Clears young from a PTE                          |
++---------------------------+--------------------------------------------------+
+| ptep_set_wrprotect        | Converts into a write protected PTE              |
++---------------------------+--------------------------------------------------+
+| ptep_set_access_flags     | Converts into a more permissive PTE              |
++---------------------------+--------------------------------------------------+
+
+
+PMD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pmd_same                  | Tests whether both PMD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pmd_bad                   | Tests a non-table mapped PMD                     |
++---------------------------+--------------------------------------------------+
+| pmd_leaf                  | Tests a leaf mapped PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_huge                  | Tests a HugeTLB mapped PMD                       |
++---------------------------+--------------------------------------------------+
+| pmd_trans_huge            | Tests a Transparent Huge Page (THP) at PMD       |
++---------------------------+--------------------------------------------------+
+| pmd_present               | Tests a valid mapped PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_young                 | Tests a young PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_dirty                 | Tests a dirty PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_write                 | Tests a writable PMD                             |
++---------------------------+--------------------------------------------------+
+| pmd_special               | Tests a special PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_protnone              | Tests a PROT_NONE PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_soft_dirty            | Tests a soft dirty PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_mkyoung               | Creates a young PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkold                 | Creates an old PMD                               |
++---------------------------+--------------------------------------------------+
+| pmd_mkdirty               | Creates a dirty PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkclean               | Creates a clean PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkwrite               | Creates a writable PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_wrprotect             | Creates a write protected PMD                    |
++---------------------------+--------------------------------------------------+
+| pmd_mkspecial             | Creates a special PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_mkdevmap              | Creates a ZONE_DEVICE mapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_mksoft_dirty          | Creates a soft dirty PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_clear_soft_dirty      | Clears a soft dirty PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_swp_mksoft_dirty      | Creates a soft dirty swapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
++---------------------------+--------------------------------------------------+
+| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
++---------------------------+--------------------------------------------------+
+| pmd_set_huge              | Creates a PMD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pmd_clear_huge            | Clears a PMD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear        | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear_full   | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_test_and_clear_young | Clears young from a PMD                          |
++---------------------------+--------------------------------------------------+
+| pmdp_set_wrprotect        | Converts into a write protected PMD              |
++---------------------------+--------------------------------------------------+
+| pmdp_set_access_flags     | Converts into a more permissive PMD              |
++---------------------------+--------------------------------------------------+
+
+
+PUD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pud_same                  | Tests whether both PUD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pud_bad                   | Tests a non-table mapped PUD                     |
++---------------------------+--------------------------------------------------+
+| pud_leaf                  | Tests a leaf mapped PUD                          |
++---------------------------+--------------------------------------------------+
+| pud_huge                  | Tests a HugeTLB mapped PUD                       |
++---------------------------+--------------------------------------------------+
+| pud_trans_huge            | Tests a Transparent Huge Page (THP) at PUD       |
++---------------------------+--------------------------------------------------+
+| pud_present               | Tests a valid mapped PUD                         |
++---------------------------+--------------------------------------------------+
+| pud_young                 | Tests a young PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_dirty                 | Tests a dirty PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_write                 | Tests a writable PUD                             |
++---------------------------+--------------------------------------------------+
+| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
++---------------------------+--------------------------------------------------+
+| pud_mkyoung               | Creates a young PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkold                 | Creates an old PUD                               |
++---------------------------+--------------------------------------------------+
+| pud_mkdirty               | Creates a dirty PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkclean               | Creates a clean PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkwrite               | Creates a writable PUD                           |
++---------------------------+--------------------------------------------------+
+| pud_wrprotect             | Creates a write protected PUD                    |
++---------------------------+--------------------------------------------------+
+| pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
++---------------------------+--------------------------------------------------+
+| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
++---------------------------+--------------------------------------------------+
+| pud_set_huge              | Creates a PUD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pud_clear_huge            | Clears a PUD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear        | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear_full   | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_test_and_clear_young | Clears young from a PUD                          |
++---------------------------+--------------------------------------------------+
+| pudp_set_wrprotect        | Converts into a write protected PUD              |
++---------------------------+--------------------------------------------------+
+| pudp_set_access_flags     | Converts into a more permissive PUD              |
++---------------------------+--------------------------------------------------+
+
+
+HugeTLB Page Table Helpers
+==========================
+
++---------------------------+--------------------------------------------------+
+| pte_huge                  | Tests a HugeTLB                                  |
++---------------------------+--------------------------------------------------+
+| pte_mkhuge                | Creates a HugeTLB                                |
++---------------------------+--------------------------------------------------+
+| huge_pte_dirty            | Tests a dirty HugeTLB                            |
++---------------------------+--------------------------------------------------+
+| huge_pte_write            | Tests a writable HugeTLB                         |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkdirty          | Creates a dirty HugeTLB                          |
++---------------------------+--------------------------------------------------+
+| huge_pte_mkwrite          | Creates a writable HugeTLB                       |
++---------------------------+--------------------------------------------------+
+| huge_pte_wrprotect        | Creates a write protected HugeTLB                |
++---------------------------+--------------------------------------------------+
+| huge_ptep_get_and_clear   | Clears a HugeTLB                                 |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_wrprotect   | Converts into a write protected HugeTLB          |
++---------------------------+--------------------------------------------------+
+| huge_ptep_set_access_flags  | Converts into a more permissive HugeTLB        |
++---------------------------+--------------------------------------------------+
+
+
+SWAP Page Table Helpers
+========================
+
++---------------------------+--------------------------------------------------+
+| __pte_to_swp_entry        | Creates a swapped entry (arch) from a mapped PTE |
++---------------------------+--------------------------------------------------+
+| __swp_to_pte_entry        | Creates a mapped PTE from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| __pmd_to_swp_entry        | Creates a swapped entry (arch) from a mapped PMD |
++---------------------------+--------------------------------------------------+
+| __swp_to_pmd_entry        | Creates a mapped PMD from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| is_migration_entry        | Tests a migration (read or write) swapped entry  |
++-------------------------------+----------------------------------------------+
+| is_writable_migration_entry   | Tests a write migration swapped entry        |
++-------------------------------+----------------------------------------------+
+| make_readable_migration_entry | Creates a read migration swapped entry       |
++-------------------------------+----------------------------------------------+
+| make_writable_migration_entry | Creates a write migration swapped entry      |
++-------------------------------+----------------------------------------------+
+
+[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst
new file mode 100644
index 000000000000..6a1fadf3e173
--- /dev/null
+++ b/Documentation/mm/balance.rst
@@ -0,0 +1,102 @@
+.. _balance:
+
+================
+Memory Balancing
+================
+
+Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
+
+Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+well as for non __GFP_IO allocations.
+
+The first reason why a caller may avoid reclaim is that the caller can not
+sleep due to holding a spinlock or is in interrupt context. The second may
+be that the caller is willing to fail the allocation without incurring the
+overhead of page reclaim. This may happen for opportunistic high-order
+allocation requests that have order-0 fallback options. In such cases,
+the caller may also wish to avoid waking kswapd.
+
+__GFP_IO allocation requests are made to prevent file system deadlocks.
+
+In the absence of non sleepable allocation requests, it seems detrimental
+to be doing balancing. Page reclamation can be kicked off lazily, that
+is, only when needed (aka zone free memory is 0), instead of making it
+a proactive process.
+
+That being said, the kernel should try to fulfill requests for direct
+mapped pages from the direct mapped pool, instead of falling back on
+the dma pool, so as to keep the dma pool filled for dma requests (atomic
+or not). A similar argument applies to highmem and direct mapped pages.
+OTOH, if there is a lot of free dma pages, it is preferable to satisfy
+regular memory requests by allocating one from the dma pool, instead
+of incurring the overhead of regular zone balancing.
+
+In 2.2, memory balancing/page reclamation would kick off only when the
+_total_ number of free pages fell below 1/64 th of total memory. With the
+right ratio of dma and regular memory, it is quite possible that balancing
+would not be done even when the dma zone was completely empty. 2.2 has
+been running production machines of varying memory sizes, and seems to be
+doing fine even with the presence of this problem. In 2.3, due to
+HIGHMEM, this problem is aggravated.
+
+In 2.3, zone balancing can be done in one of two ways: depending on the
+zone size (and possibly of the size of lower class zones), we can decide
+at init time how many free pages we should aim for while balancing any
+zone. The good part is, while balancing, we do not need to look at sizes
+of lower class zones, the bad part is, we might do too frequent balancing
+due to ignoring possibly lower usage in the lower class zones. Also,
+with a slight change in the allocation routine, it is possible to reduce
+the memclass() macro to be a simple equality.
+
+Another possible solution is that we balance only when the free memory
+of a zone _and_ all its lower class zones falls below 1/64th of the
+total memory in the zone and its lower class zones. This fixes the 2.2
+balancing problem, and stays as close to 2.2 behavior as possible. Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone)
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done,
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
+structure to decide whether a zone needs balancing.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
+are per-zone fields, used to determine when a zone needs to be balanced. When
+the number of pages falls below watermark[WMARK_MIN], the hysteric field
+low_on_memory gets set. This stays set till the number of free pages becomes
+watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
+try to free some pages in the zone (providing GFP_WAIT is set in the request).
+Orthogonal to this, is the decision to poke kswapd to free some zone pages.
+That decision is not hysteresis based, and is done when the number of free
+pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
+
+
+(Good) Ideas that I have heard:
+
+1. Dynamic experience should influence balancing: number of failed requests
+   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/mm/bootmem.rst b/Documentation/mm/bootmem.rst
new file mode 100644
index 000000000000..eb2b31eedfa1
--- /dev/null
+++ b/Documentation/mm/bootmem.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Boot Memory
+===========
diff --git a/Documentation/mm/damon/api.rst b/Documentation/mm/damon/api.rst
new file mode 100644
index 000000000000..08f34df45523
--- /dev/null
+++ b/Documentation/mm/damon/api.rst
@@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+API Reference
+=============
+
+Kernel space programs can use every feature of DAMON using below APIs.  All you
+need to do is including ``damon.h``, which is located in ``include/linux/`` of
+the source tree.
+
+Structures
+==========
+
+.. kernel-doc:: include/linux/damon.h
+
+
+Functions
+=========
+
+.. kernel-doc:: mm/damon/core.c
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
new file mode 100644
index 000000000000..0cff6fac6b7e
--- /dev/null
+++ b/Documentation/mm/damon/design.rst
@@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+Design
+======
+
+Configurable Layers
+===================
+
+DAMON provides data access monitoring functionality while making the accuracy
+and the overhead controllable.  The fundamental access monitorings require
+primitives that dependent on and optimized for the target address space.  On
+the other hand, the accuracy and overhead tradeoff mechanism, which is the core
+of DAMON, is in the pure logic space.  DAMON separates the two parts in
+different layers and defines its interface to allow various low level
+primitives implementations configurable with the core logic.  We call the low
+level primitives implementations monitoring operations.
+
+Due to this separated design and the configurable interface, users can extend
+DAMON for any address space by configuring the core logics with appropriate
+monitoring operations.  If appropriate one is not provided, users can implement
+the operations on their own.
+
+For example, physical memory, virtual memory, swap space, those for specific
+processes, NUMA nodes, files, and backing memory devices would be supportable.
+Also, if some architectures or devices support special optimized access check
+primitives, those will be easily configurable.
+
+
+Reference Implementations of Address Space Specific Monitoring Operations
+=========================================================================
+
+The monitoring operations are defined in two parts:
+
+1. Identification of the monitoring target address range for the address space.
+2. Access check of specific address range in the target space.
+
+DAMON currently provides the implementations of the operations for the physical
+and virtual address spaces. Below two subsections describe how those work.
+
+
+VMA-based Target Address Range Construction
+-------------------------------------------
+
+This is only for the virtual address space monitoring operations
+implementation.  That for the physical address space simply asks users to
+manually set the monitoring target address ranges.
+
+Only small parts in the super-huge virtual address space of the processes are
+mapped to the physical memory and accessed.  Thus, tracking the unmapped
+address regions is just wasteful.  However, because DAMON can deal with some
+level of noise using the adaptive regions adjustment mechanism, tracking every
+mapping is not strictly required but could even incur a high overhead in some
+cases.  That said, too huge unmapped areas inside the monitoring target should
+be removed to not take the time for the adaptive mechanism.
+
+For the reason, this implementation converts the complex mappings to three
+distinct regions that cover every mapped area of the address space.  The two
+gaps between the three regions are the two biggest unmapped areas in the given
+address space.  The two biggest unmapped areas would be the gap between the
+heap and the uppermost mmap()-ed region, and the gap between the lowermost
+mmap()-ed region and the stack in most of the cases.  Because these gaps are
+exceptionally huge in usual address spaces, excluding these will be sufficient
+to make a reasonable trade-off.  Below shows this in detail::
+
+    <heap>
+    <BIG UNMAPPED REGION 1>
+    <uppermost mmap()-ed region>
+    (small mmap()-ed regions and munmap()-ed regions)
+    <lowermost mmap()-ed region>
+    <BIG UNMAPPED REGION 2>
+    <stack>
+
+
+PTE Accessed-bit Based Access Check
+-----------------------------------
+
+Both of the implementations for physical and virtual address spaces use PTE
+Accessed-bit for basic access checks.  Only one difference is the way of
+finding the relevant PTE Accessed bit(s) from the address.  While the
+implementation for the virtual address walks the page table for the target task
+of the address, the implementation for the physical address walks every page
+table having a mapping to the address.  In this way, the implementations find
+and clear the bit(s) for next sampling target address and checks whether the
+bit(s) set again after one sampling period.  This could disturb other kernel
+subsystems using the Accessed bits, namely Idle page tracking and the reclaim
+logic.  DAMON does nothing to avoid disturbing Idle page tracking, so handling
+the interference is the responsibility of sysadmins.  However, it solves the
+conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
+as Idle page tracking does.
+
+
+Address Space Independent Core Mechanisms
+=========================================
+
+Below four sections describe each of the DAMON core mechanisms and the five
+monitoring attributes, ``sampling interval``, ``aggregation interval``,
+``update interval``, ``minimum number of regions``, and ``maximum number of
+regions``.
+
+
+Access Frequency Monitoring
+---------------------------
+
+The output of DAMON says what pages are how frequently accessed for a given
+duration.  The resolution of the access frequency is controlled by setting
+``sampling interval`` and ``aggregation interval``.  In detail, DAMON checks
+access to each page per ``sampling interval`` and aggregates the results.  In
+other words, counts the number of the accesses to each page.  After each
+``aggregation interval`` passes, DAMON calls callback functions that previously
+registered by users so that users can read the aggregated results and then
+clears the results.  This can be described in below simple pseudo-code::
+
+    while monitoring_on:
+        for page in monitoring_target:
+            if accessed(page):
+                nr_accesses[page] += 1
+        if time() % aggregation_interval == 0:
+            for callback in user_registered_callbacks:
+                callback(monitoring_target, nr_accesses)
+            for page in monitoring_target:
+                nr_accesses[page] = 0
+        sleep(sampling interval)
+
+The monitoring overhead of this mechanism will arbitrarily increase as the
+size of the target workload grows.
+
+
+Region Based Sampling
+---------------------
+
+To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
+that assumed to have the same access frequencies into a region.  As long as the
+assumption (pages in a region have the same access frequencies) is kept, only
+one page in the region is required to be checked.  Thus, for each ``sampling
+interval``, DAMON randomly picks one page in each region, waits for one
+``sampling interval``, checks whether the page is accessed meanwhile, and
+increases the access frequency of the region if so.  Therefore, the monitoring
+overhead is controllable by setting the number of regions.  DAMON allows users
+to set the minimum and the maximum number of regions for the trade-off.
+
+This scheme, however, cannot preserve the quality of the output if the
+assumption is not guaranteed.
+
+
+Adaptive Regions Adjustment
+---------------------------
+
+Even somehow the initial monitoring target regions are well constructed to
+fulfill the assumption (pages in same region have similar access frequencies),
+the data access pattern can be dynamically changed.  This will result in low
+monitoring quality.  To keep the assumption as much as possible, DAMON
+adaptively merges and splits each region based on their access frequency.
+
+For each ``aggregation interval``, it compares the access frequencies of
+adjacent regions and merges those if the frequency difference is small.  Then,
+after it reports and clears the aggregated access frequency of each region, it
+splits each region into two or three regions if the total number of regions
+will not exceed the user-specified maximum number of regions after the split.
+
+In this way, DAMON provides its best-effort quality and minimal overhead while
+keeping the bounds users set for their trade-off.
+
+
+Dynamic Target Space Updates Handling
+-------------------------------------
+
+The monitoring target address range could dynamically changed.  For example,
+virtual memory could be dynamically mapped and unmapped.  Physical memory could
+be hot-plugged.
+
+As the changes could be quite frequent in some cases, DAMON allows the
+monitoring operations to check dynamic changes including memory mapping changes
+and applies it to monitoring operations-related data structures such as the
+abstracted monitoring target memory area only for each of a user-specified time
+interval (``update interval``).
diff --git a/Documentation/mm/damon/faq.rst b/Documentation/mm/damon/faq.rst
new file mode 100644
index 000000000000..dde7e2414ee6
--- /dev/null
+++ b/Documentation/mm/damon/faq.rst
@@ -0,0 +1,50 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Frequently Asked Questions
+==========================
+
+Why a new subsystem, instead of extending perf or other user space tools?
+=========================================================================
+
+First, because it needs to be lightweight as much as possible so that it can be
+used online, any unnecessary overhead such as kernel - user space context
+switching cost should be avoided.  Second, DAMON aims to be used by other
+programs including the kernel.  Therefore, having a dependency on specific
+tools like perf is not desirable.  These are the two biggest reasons why DAMON
+is implemented in the kernel space.
+
+
+Can 'idle pages tracking' or 'perf mem' substitute DAMON?
+=========================================================
+
+Idle page tracking is a low level primitive for access check of the physical
+address space.  'perf mem' is similar, though it can use sampling to minimize
+the overhead.  On the other hand, DAMON is a higher-level framework for the
+monitoring of various address spaces.  It is focused on memory management
+optimization and provides sophisticated accuracy/overhead handling mechanisms.
+Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
+DAMON's output, but cannot substitute DAMON.
+
+
+Does DAMON support virtual memory only?
+=======================================
+
+No.  The core of the DAMON is address space independent.  The address space
+specific monitoring operations including monitoring target regions
+constructions and actual access checks can be implemented and configured on the
+DAMON core by the users.  In this way, DAMON users can monitor any address
+space with any access check technique.
+
+Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
+implementations of the address space dependent functions for the virtual memory
+and the physical memory by default, for a reference and convenient use.
+
+
+Can I simply monitor page granularity?
+======================================
+
+Yes.  You can do so by setting the ``min_nr_regions`` attribute higher than the
+working set size divided by the page size.  Because the monitoring target
+regions size is forced to be ``>=page size``, the region split will make no
+effect.
diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst
new file mode 100644
index 000000000000..48c0bbff98b2
--- /dev/null
+++ b/Documentation/mm/damon/index.rst
@@ -0,0 +1,29 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+DAMON: Data Access MONitor
+==========================
+
+DAMON is a data access monitoring framework subsystem for the Linux kernel.
+The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it
+
+ - *accurate* (the monitoring output is useful enough for DRAM level memory
+   management; It might not appropriate for CPU Cache levels, though),
+ - *light-weight* (the monitoring overhead is low enough to be applied online),
+   and
+ - *scalable* (the upper-bound of the overhead is in constant range regardless
+   of the size of target workloads).
+
+Using this framework, therefore, the kernel's memory management mechanisms can
+make advanced decisions.  Experimental memory management optimization works
+that incurring high data accesses monitoring overhead could implemented again.
+In user space, meanwhile, users who have some special workloads can write
+personalized applications for better understanding and optimizations of their
+workloads and systems.
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
diff --git a/Documentation/mm/free_page_reporting.rst b/Documentation/mm/free_page_reporting.rst
new file mode 100644
index 000000000000..8c05e62d8b2b
--- /dev/null
+++ b/Documentation/mm/free_page_reporting.rst
@@ -0,0 +1,40 @@
+.. _free_page_reporting:
+
+=====================
+Free Page Reporting
+=====================
+
+Free page reporting is an API by which a device can register to receive
+lists of pages that are currently unused by the system. This is useful in
+the case of virtualization where a guest is then able to use this data to
+notify the hypervisor that it is no longer using certain pages in memory.
+
+For the driver, typically a balloon driver, to use of this functionality
+it will allocate and initialize a page_reporting_dev_info structure. The
+field within the structure it will populate is the "report" function
+pointer used to process the scatterlist. It must also guarantee that it can
+handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
+call to the function. A call to page_reporting_register will register the
+page reporting interface with the reporting framework assuming no other
+page reporting devices are already registered.
+
+Once registered the page reporting API will begin reporting batches of
+pages to the driver. The API will start reporting pages 2 seconds after
+the interface is registered and will continue to do so 2 seconds after any
+page of a sufficiently high order is freed.
+
+Pages reported will be stored in the scatterlist passed to the reporting
+function with the final entry having the end bit set in entry nent - 1.
+While pages are being processed by the report function they will not be
+accessible to the allocator. Once the report function has been completed
+the pages will be returned to the free area from which they were obtained.
+
+Prior to removing a driver that is making use of free page reporting it
+is necessary to call page_reporting_unregister to have the
+page_reporting_dev_info structure that is currently in use by free page
+reporting removed. Doing this will prevent further reports from being
+issued via the interface. If another driver or the same driver is
+registered it is possible for it to resume where the previous driver had
+left off in terms of reporting free pages.
+
+Alexander Duyck, Dec 04, 2019
diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst
new file mode 100644
index 000000000000..feecc5e24477
--- /dev/null
+++ b/Documentation/mm/frontswap.rst
@@ -0,0 +1,266 @@
+.. _frontswap:
+
+=========
+Frontswap
+=========
+
+Frontswap provides a "transcendent memory" interface for swap pages.
+In some environments, dramatic performance savings may be obtained because
+swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
+
+Frontswap is so named because it can be thought of as the opposite of
+a "backing" store for a swap device.  The storage is assumed to be
+a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
+to the requirements of transcendent memory (such as Xen's "tmem", or
+in-kernel compressed memory, aka "zcache", or future RAM-like devices);
+this pseudo-RAM device is not directly accessible or addressable by the
+kernel and is of unknown and possibly time-varying size.  The driver
+links itself to frontswap by calling frontswap_register_ops to set the
+frontswap_ops funcs appropriately and the functions it provides must
+conform to certain policies as follows:
+
+An "init" prepares the device to receive frontswap pages associated
+with the specified swap device number (aka "type").  A "store" will
+copy the page to transcendent memory and associate it with the type and
+offset associated with the page. A "load" will copy the page, if found,
+from transcendent memory into kernel memory, but will NOT remove the page
+from transcendent memory.  An "invalidate_page" will remove the page
+from transcendent memory and an "invalidate_area" will remove ALL pages
+associated with the swap type (e.g., like swapoff) and notify the "device"
+to refuse further stores with that swap type.
+
+Once a page is successfully stored, a matching load on the page will normally
+succeed.  So when the kernel finds itself in a situation where it needs
+to swap out a page, it first attempts to use frontswap.  If the store returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a store returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+Note that if a page is stored and the page already exists in transcendent memory
+(a "duplicate" store), either the store succeeds and the data is overwritten,
+or the store fails AND the page is invalidated.  This ensures stale data may
+never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the `/sys/kernel/debug/frontswap` directory.  The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+``failed_stores``
+	how many store attempts have failed
+
+``loads``
+	how many loads were attempted (all should succeed)
+
+``succ_stores``
+	how many store attempts have succeeded
+
+``invalidates``
+	how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+===
+
+* Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable to the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices).  Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device".
+
+Frontswap with a fairly small impact on the kernel,
+provides a huge amount of flexibility for more dynamic, flexible RAM
+utilization in various system configurations:
+
+In the single kernel case, aka "zcache", pages are compressed and
+stored in local memory, thus increasing the total anonymous pages
+that can be safely kept in RAM.  Zcache essentially trades off CPU
+cycles used in compression/decompression for better memory utilization.
+Benchmarks have shown little or no impact when memory pressure is
+low while providing a significant performance improvement (25%+)
+on some workloads under high memory pressure.
+
+"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
+support for clustered systems.  Frontswap pages are locally compressed
+as in zcache, but then "remotified" to another system's RAM.  This
+allows RAM to be dynamically load-balanced back-and-forth as needed,
+i.e. when system A is overcommitted, it can swap to system B, and
+vice versa.  RAMster can also be configured as a memory server so
+many servers in a cluster can swap, dynamically as needed, to a single
+server configured with a large amount of RAM... without pre-configuring
+how much of the RAM is available for each of the clients!
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "selfballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM (if overall host system memory
+conditions allow), thus mitigating the potentially awful performance impact
+of unplanned swapping.
+
+A KVM implementation is underway and has been RFC'ed to lkml.  And,
+using frontswap, investigation is also underway on the use of NVM as
+a memory extension technology.
+
+* Sure there may be performance advantages in some situations, but
+  what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'ed
+swap device.  If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra global variable compared to zero for
+every swap page read or written.  If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "store"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd.  This is added to the EIGHT bits (which
+was sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd.  (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.)  For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
+
+When swap pages are stored in transcendent memory instead of written
+out to disk, there is a side effect that this may create more memory
+pressure that can potentially outweigh the other advantages.  A
+backend, such as zcache, must implement policies to carefully (but
+dynamically) manage memory limits to ensure this doesn't happen.
+
+* OK, how about a quick overview of what this frontswap patch does
+  in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel.  Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "store" swap pages
+associated with that number.
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (c.f swap_writepage()), frontswap_store is called.  Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_store returns -1 and the kernel swaps the page
+to the swap device as normal.  Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page.  But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data.  In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_load() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend.  If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete.  If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device read
+and (potentially) a swap device write are replaced by a "frontswap backend
+store" and (possibly) a "frontswap backend loads", which are presumably much
+faster.
+
+* Can't frontswap be configured as a "special" swap device that is
+  just higher priority than any real swap device (e.g. like zswap,
+  or maybe swap-over-nbd/NFS)?
+
+No.  First, the existing swap subsystem doesn't allow for any kind of
+swap hierarchy.  Perhaps it could be rewritten to accommodate a hierarchy,
+but this would require fairly drastic changes.  Even if it were
+rewritten, the existing swap subsystem uses the block I/O layer which
+assumes a swap device is fixed size and any page in it is linearly
+addressable.  Frontswap barely touches the existing swap subsystem,
+and works around the constraints of the block I/O subsystem to provide
+a great deal of flexibility and dynamicity.
+
+For example, the acceptance of any swap page by the frontswap backend is
+entirely unpredictable. This is critical to the definition of frontswap
+backends because it grants completely dynamic discretion to the
+backend.  In zcache, one cannot know a priori how compressible a page is.
+"Poorly" compressible pages can be rejected, and "poorly" can itself be
+defined dynamically depending on current memory constraints.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O.  The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time.  Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem.  That said, only the initial "store"
+and "load" operations need be synchronous.  A separate asynchronous thread
+is free to manipulate the pages stored by frontswap.  For example,
+the "remotification" thread in RAMster uses standard asynchronous
+kernel sockets to move compressed frontswap pages to a remote machine.
+Similarly, a KVM guest-side implementation could do in-guest compression
+and use "batched" hypercalls.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit".  For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.
+
+There is a downside to the transcendent memory specifications for
+frontswap:  Since any "store" might fail, there must always be a real
+slot on a real swap device to swap the page.  Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all.  This means
+that frontswap cannot contain more pages than the total of swapon'd
+swap devices.  For example, if NO swap device is configured on some
+installation, frontswap is useless.  Swapless portable devices
+can still use frontswap but a backend for such devices must configure
+some kind of "ghost" swap device and ensure that it is never used.
+
+* Why this weird definition about "duplicate stores"?  If a page
+  has been previously successfully stored, can't it always be
+  successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot.  Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K.  Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K.  But the backend
+has no more space.  In this case, the store must be rejected.  Whenever
+frontswap rejects a store that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible.  Since the
+swap subsystem then writes the new data to the read swap device,
+this is the correct course of action to ensure coherency.
+
+* Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global.  This seemed a reasonable compromise:  Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst
new file mode 100644
index 000000000000..c9887f241c6c
--- /dev/null
+++ b/Documentation/mm/highmem.rst
@@ -0,0 +1,167 @@
+.. _highmem:
+
+====================
+High Memory Handling
+====================
+
+By: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+.. contents:: :local:
+
+What Is High Memory?
+====================
+
+High memory (highmem) is used when the size of physical memory approaches or
+exceeds the maximum size of virtual memory.  At that point it becomes
+impossible for the kernel to keep all of the available physical memory mapped
+at all times.  This means the kernel needs to start using temporary mappings of
+the pieces of physical memory that it wants to access.
+
+The part of (physical) memory not covered by a permanent mapping is what we
+refer to as 'highmem'.  There are various architecture dependent constraints on
+where exactly that border lies.
+
+In the i386 arch, for example, we choose to map the kernel into every process's
+VM space so that we don't have to pay the full TLB invalidation costs for
+kernel entry/exit.  This means the available virtual memory space (4GiB on
+i386) has to be divided between user and kernel space.
+
+The traditional split for architectures using this approach is 3:1, 3GiB for
+userspace and the top 1GiB for kernel space::
+
+		+--------+ 0xffffffff
+		| Kernel |
+		+--------+ 0xc0000000
+		|        |
+		| User   |
+		|        |
+		+--------+ 0x00000000
+
+This means that the kernel can at most map 1GiB of physical memory at any one
+time, but because we need virtual address space for other things - including
+temporary maps to access the rest of the physical memory - the actual direct
+map will typically be less (usually around ~896MiB).
+
+Other architectures that have mm context tagged TLBs can have separate kernel
+and user maps.  Some hardware (like some ARMs), however, have limited virtual
+space when they use mm context tags.
+
+
+Temporary Virtual Mappings
+==========================
+
+The kernel contains several ways of creating temporary mappings. The following
+list shows them in order of preference of use.
+
+* kmap_local_page().  This function is used to require short term mappings.
+  It can be invoked from any context (including interrupts) but the mappings
+  can only be used in the context which acquired them.
+
+  This function should be preferred, where feasible, over all the others.
+
+  These mappings are thread-local and CPU-local, meaning that the mapping
+  can only be accessed from within this thread and the thread is bound the
+  CPU while the mapping is active. Even if the thread is preempted (since
+  preemption is never disabled by the function) the CPU can not be
+  unplugged from the system via CPU-hotplug until the mapping is disposed.
+
+  It's valid to take pagefaults in a local kmap region, unless the context
+  in which the local mapping is acquired does not allow it for other reasons.
+
+  kmap_local_page() always returns a valid virtual address and it is assumed
+  that kunmap_local() will never fail.
+
+  Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
+  extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
+  because the map implementation is stack based. See kmap_local_page() kdocs
+  (included in the "Functions" section) for details on how to manage nested
+  mappings.
+
+* kmap_atomic().  This permits a very short duration mapping of a single
+  page.  Since the mapping is restricted to the CPU that issued it, it
+  performs well, but the issuing task is therefore required to stay on that
+  CPU until it has finished, lest some other task displace its mappings.
+
+  kmap_atomic() may also be used by interrupt contexts, since it does not
+  sleep and the callers too may not sleep until after kunmap_atomic() is
+  called.
+
+  Each call of kmap_atomic() in the kernel creates a non-preemptible section
+  and disable pagefaults. This could be a source of unwanted latency. Therefore
+  users should prefer kmap_local_page() instead of kmap_atomic().
+
+  It is assumed that k[un]map_atomic() won't fail.
+
+* kmap().  This should be used to make short duration mapping of a single
+  page with no restrictions on preemption or migration. It comes with an
+  overhead as mapping space is restricted and protected by a global lock
+  for synchronization. When mapping is no longer needed, the address that
+  the page was mapped to must be released with kunmap().
+
+  Mapping changes must be propagated across all the CPUs. kmap() also
+  requires global TLB invalidation when the kmap's pool wraps and it might
+  block when the mapping space is fully utilized until a slot becomes
+  available. Therefore, kmap() is only callable from preemptible context.
+
+  All the above work is necessary if a mapping must last for a relatively
+  long time but the bulk of high-memory mappings in the kernel are
+  short-lived and only used in one place. This means that the cost of
+  kmap() is mostly wasted in such cases. kmap() was not intended for long
+  term mappings but it has morphed in that direction and its use is
+  strongly discouraged in newer code and the set of the preceding functions
+  should be preferred.
+
+  On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have
+  no real work to do because a 64-bit address space is more than sufficient to
+  address all the physical memory whose pages are permanently mapped.
+
+* vmap().  This can be used to make a long duration mapping of multiple
+  physical pages into a contiguous virtual space.  It needs global
+  synchronization to unmap.
+
+
+Cost of Temporary Mappings
+==========================
+
+The cost of creating temporary mappings can be quite high.  The arch has to
+manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
+
+If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping
+simply with a bit of arithmetic that will convert the page struct address into
+a pointer to the page contents rather than juggling mappings about.  In such a
+case, the unmap operation may be a null operation.
+
+If CONFIG_MMU is not set, then there can be no temporary mappings and no
+highmem.  In such a case, the arithmetic approach will also be used.
+
+
+i386 PAE
+========
+
+The i386 arch, under some circumstances, will permit you to stick up to 64GiB
+of RAM into your 32-bit machine.  This has a number of consequences:
+
+* Linux needs a page-frame structure for each page in the system and the
+  pageframes need to live in the permanent mapping, which means:
+
+* you can have 896M/sizeof(struct page) page-frames at most; with struct
+  page being 32-bytes that would end up being something in the order of 112G
+  worth of pages; the kernel, however, needs to store more than just
+  page-frames in that memory...
+
+* PAE makes your page tables larger - which slows the system down as more
+  data has to be accessed to traverse in TLB fills and the like.  One
+  advantage is that PAE has more PTE bits and can provide advanced features
+  like NX and PAT.
+
+The general recommendation is that you don't use more than 8GiB on a 32-bit
+machine - although more might work for you and your workload, you're pretty
+much on your own - don't expect kernel developers to really care much if things
+come apart.
+
+
+Functions
+=========
+
+.. kernel-doc:: include/linux/highmem.h
+.. kernel-doc:: include/linux/highmem-internal.h
diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst
new file mode 100644
index 000000000000..f2a59ed82ed3
--- /dev/null
+++ b/Documentation/mm/hmm.rst
@@ -0,0 +1,452 @@
+.. _hmm:
+
+=====================================
+Heterogeneous Memory Management (HMM)
+=====================================
+
+Provide infrastructure and helpers to integrate non-conventional memory (device
+memory like GPU on board memory) into regular kernel path, with the cornerstone
+of this being specialized struct page for such memory (see sections 5 to 7 of
+this document).
+
+HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
+allowing a device to transparently access program addresses coherently with
+the CPU meaning that any valid pointer on the CPU is also a valid pointer
+for the device. This is becoming mandatory to simplify the use of advanced
+heterogeneous computing where GPU, DSP, or FPGA are used to perform various
+computations on behalf of a process.
+
+This document is divided as follows: in the first section I expose the problems
+related to using device specific memory allocators. In the second section, I
+expose the hardware limitations that are inherent to many platforms. The third
+section gives an overview of the HMM design. The fourth section explains how
+CPU page-table mirroring works and the purpose of HMM in this context. The
+fifth section deals with how device memory is represented inside the kernel.
+Finally, the last section presents a new migration helper that allows
+leveraging the device DMA engine.
+
+.. contents:: :local:
+
+Problems of using a device specific memory allocator
+====================================================
+
+Devices with a large amount of on board memory (several gigabytes) like GPUs
+have historically managed their memory through dedicated driver specific APIs.
+This creates a disconnect between memory allocated and managed by a device
+driver and regular application memory (private anonymous, shared memory, or
+regular file backed memory). From here on I will refer to this aspect as split
+address space. I use shared address space to refer to the opposite situation:
+i.e., one in which any application memory region can be used by a device
+transparently.
+
+Split address space happens because devices can only access memory allocated
+through a device specific API. This implies that all memory objects in a program
+are not equal from the device point of view which complicates large programs
+that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage devices like GPUs needs
+to copy objects between generically allocated memory (malloc, mmap private, mmap
+share) and memory allocated through the device driver API (this still ends up
+with an mmap but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
+for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
+complex data set needs to re-map all the pointer relations between each of its
+elements. This is error prone and programs get harder to debug because of the
+duplicate data set and addresses.
+
+Split address space also means that libraries cannot transparently use data
+they are getting from the core program or another library and thus each library
+might have to duplicate its input data set using the device specific memory
+allocator. Large projects suffer from this and waste resources because of the
+various memory copies.
+
+Duplicating each library API to accept as input or output memory allocated by
+each device specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage GPUs and
+other devices without programmer knowledge. Some compiler identified patterns
+are only do-able with a shared address space. It is also more reasonable to use
+a shared address space for all other patterns.
+
+
+I/O bus, device memory characteristics
+======================================
+
+I/O buses cripple shared address spaces due to a few limitations. Most I/O
+buses only allow basic memory access from device to main memory; even cache
+coherency is often optional. Access to device memory from a CPU is even more
+limited. More often than not, it is not cache coherent.
+
+If we only consider the PCIE bus, then a device can access main memory (often
+through an IOMMU) and be cache coherent with the CPUs. However, it only allows
+a limited set of atomic operations from the device on main memory. This is worse
+in the other direction: the CPU can only access a limited range of the device
+memory and cannot perform atomic operations on it. Thus device memory cannot
+be considered the same as regular memory from the kernel point of view.
+
+Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
+and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s).
+The final limitation is latency. Access to main memory from the device has an
+order of magnitude higher latency than when the device accesses its own memory.
+
+Some platforms are developing new I/O buses or additions/modifications to PCIE
+to address some of these limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between CPU and device and allow all atomic operations the
+architecture supports. Sadly, not all platforms are following this trend and
+some major architectures are left without hardware solutions to these problems.
+
+So for shared address space to make sense, not only must we allow devices to
+access any memory but we must also permit any memory to be migrated to device
+memory while the device is using it (blocking CPU access while it happens).
+
+
+Shared address space and migration
+==================================
+
+HMM intends to provide two main features. The first one is to share the address
+space by duplicating the CPU page table in the device page table so the same
+address points to the same physical memory for any valid main memory address in
+the process address space.
+
+To achieve this, HMM offers a set of helpers to populate the device page table
+while keeping track of CPU page table updates. Device page table updates are
+not as easy as CPU page table updates. To update the device page table, you must
+allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
+specific commands in it to perform the update (unmap, cache invalidations, and
+flush, ...). This cannot be done through common code for all devices. Hence
+why HMM provides helpers to factor out everything that can be while leaving the
+hardware specific details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of device memory. Those pages
+are special because the CPU cannot map them. However, they allow migrating
+main memory to device memory using existing migration mechanisms and everything
+looks like a page that is swapped out to disk from the CPU point of view. Using a
+struct page gives the easiest and cleanest integration with existing mm
+mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
+memory for the device memory and second to perform migration. Policy decisions
+of what and when to migrate is left to the device driver.
+
+Note that any CPU access to a device page triggers a page fault and a migration
+back to main memory. For example, when a page backing a given CPU address A is
+migrated from a main memory page to a device page, then any CPU access to
+address A triggers a page fault and initiates a migration back to main memory.
+
+With these two features, HMM not only allows a device to mirror process address
+space and keeps both CPU and device page tables synchronized, but also
+leverages device memory by migrating the part of the data set that is actively being
+used by the device.
+
+
+Address space mirroring implementation and API
+==============================================
+
+Address space mirroring's main objective is to allow duplication of a range of
+CPU page table into a device page table; HMM helps keep both synchronized. A
+device driver that wants to mirror a process address space must start with the
+registration of a mmu_interval_notifier::
+
+ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+				  struct mm_struct *mm, unsigned long start,
+				  unsigned long length,
+				  const struct mmu_interval_notifier_ops *ops);
+
+During the ops->invalidate() callback the device driver must perform the
+update action to the range (mark range read only, or fully unmap, etc.). The
+device must complete the update before the driver callback returns.
+
+When the device driver wants to populate a range of virtual addresses, it can
+use::
+
+  int hmm_range_fault(struct hmm_range *range);
+
+It will trigger a page fault on missing or read-only entries if write access is
+requested (see below). Page faults use the generic mm page fault code path just
+like a CPU page fault.
+
+Both functions copy CPU page table entries into their pfns array argument. Each
+entry in that array corresponds to an address in the virtual range. HMM
+provides a set of flags to help the driver identify special CPU page table
+entries.
+
+Locking within the sync_cpu_device_pagetables() callback is the most important
+aspect the driver must respect in order to keep things properly synchronized.
+The usage pattern is::
+
+ int driver_populate_range(...)
+ {
+      struct hmm_range range;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub->notifier.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      mmap_read_lock(mm);
+      ret = hmm_range_fault(&range);
+      if (ret) {
+          mmap_read_unlock(mm);
+          if (ret == -EBUSY)
+                 goto again;
+          return ret;
+      }
+      mmap_read_unlock(mm);
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
+The driver->update lock is the same lock that the driver takes inside its
+invalidate() callback. That lock must be held before calling
+mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
+update.
+
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
+fault or snapshot policy for the whole range instead of having to set them
+for each entry in the pfns array.
+
+For instance if the device driver wants pages for a range with at least read
+permission, it sets::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fill fault all pages
+in the range with at least read permission.
+
+Now let's say the driver wants to do the same except for one page in the range for
+which it wants to have write permission. Now driver set::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
+
+With this, HMM will fault in all pages with at least read (i.e., valid) and for the
+address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+write permission i.e., if the CPU pte does not have write permission set then HMM
+will call handle_mm_fault().
+
+After hmm_range_fault completes the flag bits are set to the current state of
+the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
+writable.
+
+
+Represent and manage device memory from core kernel point of view
+=================================================================
+
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated memory
+and HMM hooked itself in various places of mm code to handle any access to
+addresses that were backed by device memory. It turns out that this ended up
+replicating most of the fields of struct page and also needed many kernel code
+paths to be updated to understand this new kind of memory.
+
+Most kernel code paths never try to access the memory behind a page
+but only care about struct page contents. Because of this, HMM switched to
+directly using struct page for device memory which left most kernel code paths
+unaware of the difference. We only need to make sure that no one ever tries to
+map those pages from the CPU side.
+
+Migration to and from device memory
+===================================
+
+Because the CPU cannot access device memory directly, the device driver must
+use hardware DMA or device specific load/store instructions to migrate data.
+The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
+functions are designed to make drivers easier to write and to centralize common
+code across drivers.
+
+Before migrating pages to device private memory, special device private
+``struct page`` need to be created. These will be used as special "swap"
+page table entries so that a CPU process will fault if it tries to access
+a page that has been migrated to device private memory.
+
+These can be allocated and freed with::
+
+    struct resource *res;
+    struct dev_pagemap pagemap;
+
+    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
+                                  "name of driver resource");
+    pagemap.type = MEMORY_DEVICE_PRIVATE;
+    pagemap.range.start = res->start;
+    pagemap.range.end = res->end;
+    pagemap.nr_range = 1;
+    pagemap.ops = &device_devmem_ops;
+    memremap_pages(&pagemap, numa_node_id());
+
+    memunmap_pages(&pagemap);
+    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
+
+There are also devm_request_free_mem_region(), devm_memremap_pages(),
+devm_memunmap_pages(), and devm_release_mem_region() when the resources can
+be tied to a ``struct device``.
+
+The overall migration steps are similar to migrating NUMA pages within system
+memory (see :ref:`Page migration <page_migration>`) but the steps are split
+between device driver specific code and shared common code:
+
+1. ``mmap_read_lock()``
+
+   The device driver has to pass a ``struct vm_area_struct`` to
+   migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to
+   be held for the duration of the migration.
+
+2. ``migrate_vma_setup(struct migrate_vma *args)``
+
+   The device driver initializes the ``struct migrate_vma`` fields and passes
+   the pointer to migrate_vma_setup(). The ``args->flags`` field is used to
+   filter which source pages should be migrated. For example, setting
+   ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and
+   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in
+   device private memory. If the latter flag is set, the ``args->pgmap_owner``
+   field is used to identify device private pages owned by the driver. This
+   avoids trying to migrate device private pages residing in other devices.
+   Currently only anonymous private VMA ranges can be migrated to or from
+   system memory and device private memory.
+
+   One of the first steps migrate_vma_setup() does is to invalidate other
+   device's MMUs with the ``mmu_notifier_invalidate_range_start(()`` and
+   ``mmu_notifier_invalidate_range_end()`` calls around the page table
+   walks to fill in the ``args->src`` array with PFNs to be migrated.
+   The ``invalidate_range_start()`` callback is passed a
+   ``struct mmu_notifier_range`` with the ``event`` field set to
+   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
+   the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is
+   allows the device driver to skip the invalidation callback and only
+   invalidate device private MMU mappings that are actually migrating.
+   This is explained more in the next section.
+
+   While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()``
+   entry results in a valid "zero" PFN stored in the ``args->src`` array.
+   This lets the driver allocate device private memory and clear it instead
+   of copying a page of zeros. Valid PTE entries to system memory or
+   device private struct pages will be locked with ``lock_page()``, isolated
+   from the LRU (if system memory since device private pages are not on
+   the LRU), unmapped from the process, and a special migration PTE is
+   inserted in place of the original PTE.
+   migrate_vma_setup() also clears the ``args->dst`` array.
+
+3. The device driver allocates destination pages and copies source pages to
+   destination pages.
+
+   The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE``
+   bit is set and skips entries that are not migrating. The device driver
+   can also choose to skip migrating a page by not filling in the ``dst``
+   array for that page.
+
+   The driver then allocates either a device private struct page or a
+   system memory page, locks the page with ``lock_page()``, and fills in the
+   ``dst`` array entry with::
+
+     dst[i] = migrate_pfn(page_to_pfn(dpage));
+
+   Now that the driver knows that this page is being migrated, it can
+   invalidate device private MMU mappings and copy device private memory
+   to system memory or another device private page. The core Linux kernel
+   handles CPU page table invalidations so the device driver only has to
+   invalidate its own MMU mappings.
+
+   The driver can use ``migrate_pfn_to_page(src[i])`` to get the
+   ``struct page`` of the source and either copy the source page to the
+   destination or clear the destination device private memory if the pointer
+   is ``NULL`` meaning the source page was not populated in system memory.
+
+4. ``migrate_vma_pages()``
+
+   This step is where the migration is actually "committed".
+
+   If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this
+   is where the newly allocated page is inserted into the CPU's page table.
+   This can fail if a CPU thread faults on the same page. However, the page
+   table is locked and only one of the new pages will be inserted.
+   The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared
+   if it loses the race.
+
+   If the source page was locked, isolated, etc. the source ``struct page``
+   information is now copied to destination ``struct page`` finalizing the
+   migration on the CPU side.
+
+5. Device driver updates device MMU page tables for pages still migrating,
+   rolling back pages not migrating.
+
+   If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device
+   driver can update the device MMU and set the write enable bit if the
+   ``MIGRATE_PFN_WRITE`` bit is set.
+
+6. ``migrate_vma_finalize()``
+
+   This step replaces the special migration page table entry with the new
+   page's page table entry and releases the reference to the source and
+   destination ``struct page``.
+
+7. ``mmap_read_unlock()``
+
+   The lock can now be released.
+
+Exclusive access memory
+=======================
+
+Some devices have features such as atomic PTE bits that can be used to implement
+atomic access to system memory. To support atomic operations to a shared virtual
+memory page such a device needs access to that page which is exclusive of any
+userspace access from the CPU. The ``make_device_exclusive_range()`` function
+can be used to make a memory range inaccessible from userspace.
+
+This replaces all mappings for pages in the given range with special swap
+entries. Any attempt to access the swap entry results in a fault which is
+resovled by replacing the entry with the original mapping. A driver gets
+notified that the mapping has been changed by MMU notifiers, after which point
+it will no longer have exclusive access to the page. Exclusive access is
+guranteed to last until the driver drops the page lock and page reference, at
+which point any CPU faults on the page may proceed as described.
+
+Memory cgroup (memcg) and rss accounting
+========================================
+
+For now, device memory is accounted as any regular page in rss counters (either
+anonymous if device page is used for anonymous, file if device page is used for
+file backed page, or shmem if device page is used for shared memory). This is a
+deliberate choice to keep existing applications, that might start using device
+memory without knowing about it, running unimpacted.
+
+A drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory and thus not freeing much
+system memory. We want to gather more real world experience on how applications
+and system react under memory pressure in the presence of device memory before
+deciding to account device memory differently.
+
+
+Same decision was made for memory cgroup. Device memory pages are accounted
+against same memory cgroup a regular page would be accounted to. This does
+simplify migration to and from device memory. This also means that migration
+back from device memory to regular memory cannot fail because it would
+go above memory cgroup limit. We might revisit this choice latter on once we
+get more experience in how device memory is used and its impact on memory
+resource control.
+
+
+Note that device memory can never be pinned by a device driver nor through GUP
+and thus such memory is always free upon process exit. Or when last reference
+is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst
new file mode 100644
index 000000000000..f143954e0d05
--- /dev/null
+++ b/Documentation/mm/hugetlbfs_reserv.rst
@@ -0,0 +1,596 @@
+.. _hugetlbfs_reserve:
+
+=====================
+Hugetlbfs Reservation
+=====================
+
+Overview
+========
+
+Huge pages as described at :ref:`hugetlbpage` are typically
+preallocated for application use.  These huge pages are instantiated in a
+task's address space at page fault time if the VMA indicates huge pages are
+to be used.  If no huge page exists at page fault time, the task is sent
+a SIGBUS and often dies an unhappy death.  Shortly after huge page support
+was added, it was determined that it would be better to detect a shortage
+of huge pages at mmap() time.  The idea is that if there were not enough
+huge pages to cover the mapping, the mmap() would fail.  This was first
+done with a simple check in the code at mmap() time to determine if there
+were enough free huge pages to cover the mapping.  Like most things in the
+kernel, the code has evolved over time.  However, the basic idea was to
+'reserve' huge pages at mmap() time to ensure that huge pages would be
+available for page faults in that mapping.  The description below attempts to
+describe how huge page reserve processing is done in the v4.10 kernel.
+
+
+Audience
+========
+This description is primarily targeted at kernel developers who are modifying
+hugetlbfs code.
+
+
+The Data Structures
+===================
+
+resv_huge_pages
+	This is a global (per-hstate) count of reserved huge pages.  Reserved
+	huge pages are only available to the task which reserved them.
+	Therefore, the number of huge pages generally available is computed
+	as (``free_huge_pages - resv_huge_pages``).
+Reserve Map
+	A reserve map is described by the structure::
+
+		struct resv_map {
+			struct kref refs;
+			spinlock_t lock;
+			struct list_head regions;
+			long adds_in_progress;
+			struct list_head region_cache;
+			long region_cache_count;
+		};
+
+	There is one reserve map for each huge page mapping in the system.
+	The regions list within the resv_map describes the regions within
+	the mapping.  A region is described as::
+
+		struct file_region {
+			struct list_head link;
+			long from;
+			long to;
+		};
+
+	The 'from' and 'to' fields of the file region structure are huge page
+	indices into the mapping.  Depending on the type of mapping, a
+	region in the reserv_map may indicate reservations exist for the
+	range, or reservations do not exist.
+Flags for MAP_PRIVATE Reservations
+	These are stored in the bottom bits of the reservation map pointer.
+
+	``#define HPAGE_RESV_OWNER    (1UL << 0)``
+		Indicates this task is the owner of the reservations
+		associated with the mapping.
+	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+		Indicates task originally mapping this range (and creating
+		reserves) has unmapped a page from this task (the child)
+		due to a failed COW.
+Page Flags
+	The PagePrivate page flag is used to indicate that a huge page
+	reservation must be restored when the huge page is freed.  More
+	details will be discussed in the "Freeing huge pages" section.
+
+
+Reservation Map Location (Private or Shared)
+============================================
+
+A huge page mapping or segment is either private or shared.  If private,
+it is typically only available to a single address space (task).  If shared,
+it can be mapped into multiple address spaces (tasks).  The location and
+semantics of the reservation map is significantly different for the two types
+of mappings.  Location differences are:
+
+- For private mappings, the reservation map hangs off the VMA structure.
+  Specifically, vma->vm_private_data.  This reserve map is created at the
+  time the mapping (mmap(MAP_PRIVATE)) is created.
+- For shared mappings, the reservation map hangs off the inode.  Specifically,
+  inode->i_mapping->private_data.  Since shared mappings are always backed
+  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
+  contains a reservation map.  As a result, the reservation map is allocated
+  when the inode is created.
+
+
+Creating Reservations
+=====================
+Reservations are created when a huge page backed shared memory segment is
+created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
+These operations result in a call to the routine hugetlb_reserve_pages()::
+
+	int hugetlb_reserve_pages(struct inode *inode,
+				  long from, long to,
+				  struct vm_area_struct *vma,
+				  vm_flags_t vm_flags)
+
+The first thing hugetlb_reserve_pages() does is check if the NORESERVE
+flag was specified in either the shmget() or mmap() call.  If NORESERVE
+was specified, then this routine returns immediately as no reservations
+are desired.
+
+The arguments 'from' and 'to' are huge page indices into the mapping or
+underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
+the length of the segment/mapping.  For mmap(), the offset argument could
+be used to specify the offset into the underlying file.  In such a case,
+the 'from' and 'to' arguments have been adjusted by this offset.
+
+One of the big differences between PRIVATE and SHARED mappings is the way
+in which reservations are represented in the reservation map.
+
+- For shared mappings, an entry in the reservation map indicates a reservation
+  exists or did exist for the corresponding page.  As reservations are
+  consumed, the reservation map is not modified.
+- For private mappings, the lack of an entry in the reservation map indicates
+  a reservation exists for the corresponding page.  As reservations are
+  consumed, entries are added to the reservation map.  Therefore, the
+  reservation map can also be used to determine which reservations have
+  been consumed.
+
+For private mappings, hugetlb_reserve_pages() creates the reservation map and
+hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag is set
+to indicate this VMA owns the reservations.
+
+The reservation map is consulted to determine how many huge page reservations
+are needed for the current mapping/segment.  For private mappings, this is
+always the value (to - from).  However, for shared mappings it is possible that
+some reservations may already exist within the range (to - from).  See the
+section :ref:`Reservation Map Modifications <resv_map_modifications>`
+for details on how this is accomplished.
+
+The mapping may be associated with a subpool.  If so, the subpool is consulted
+to ensure there is sufficient space for the mapping.  It is possible that the
+subpool has set aside reservations that can be used for the mapping.  See the
+section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
+
+After consulting the reservation map and subpool, the number of needed new
+reservations is known.  The routine hugetlb_acct_memory() is called to check
+for and take the requested number of reservations.  hugetlb_acct_memory()
+calls into routines that potentially allocate and adjust surplus page counts.
+However, within those routines the code is simply checking to ensure there
+are enough free huge pages to accommodate the reservation.  If there are,
+the global reservation count resv_huge_pages is adjusted something like the
+following::
+
+	if (resv_needed <= (resv_huge_pages - free_huge_pages))
+		resv_huge_pages += resv_needed;
+
+Note that the global lock hugetlb_lock is held when checking and adjusting
+these counters.
+
+If there were enough free huge pages and the global count resv_huge_pages
+was adjusted, then the reservation map associated with the mapping is
+modified to reflect the reservations.  In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'.  For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
+
+.. _consume_resv:
+
+Consuming Reservations/Allocating a Huge Page
+=============================================
+
+Reservations are consumed when huge pages associated with the reservations
+are allocated and instantiated in the corresponding mapping.  The allocation
+is performed within the routine alloc_huge_page()::
+
+	struct page *alloc_huge_page(struct vm_area_struct *vma,
+				     unsigned long addr, int avoid_reserve)
+
+alloc_huge_page is passed a VMA pointer and a virtual address, so it can
+consult the reservation map to determine if a reservation exists.  In addition,
+alloc_huge_page takes the argument avoid_reserve which indicates reserves
+should not be used even if it appears they have been set aside for the
+specified address.  The avoid_reserve argument is most often used in the case
+of Copy on Write and Page Migration where additional copies of an existing
+page are being allocated.
+
+The helper routine vma_needs_reservation() is called to determine if a
+reservation exists for the address within the mapping(vma).  See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
+information on what this routine does.
+The value returned from vma_needs_reservation() is generally
+0 or 1.  0 if a reservation exists for the address, 1 if no reservation exists.
+If a reservation does not exist, and there is a subpool associated with the
+mapping the subpool is consulted to determine if it contains reservations.
+If the subpool contains reservations, one can be used for this allocation.
+However, in every case the avoid_reserve argument overrides the use of
+a reservation for the allocation.  After determining whether a reservation
+exists and can be used for the allocation, the routine dequeue_huge_page_vma()
+is called.  This routine takes two arguments related to reservations:
+
+- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
+- chg, even though this argument is of type long only the values 0 or 1 are
+  passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
+  reservation exists (see the section "Memory Policy and Reservations" for
+  possible issues).  If the value is 1, it indicates a reservation does not
+  exist and the page must be taken from the global free pool if possible.
+
+The free lists associated with the memory policy of the VMA are searched for
+a free page.  If a page is found, the value free_huge_pages is decremented
+when the page is removed from the free list.  If there was a reservation
+associated with the page, the following adjustments are made::
+
+	SetPagePrivate(page);	/* Indicates allocating this page consumed
+				 * a reservation, and if an error is
+				 * encountered such that the page must be
+				 * freed, the reservation will be restored. */
+	resv_huge_pages--;	/* Decrement the global reservation count */
+
+Note, if no huge page can be found that satisfies the VMA's memory policy
+an attempt will be made to allocate one using the buddy allocator.  This
+brings up the issue of surplus huge pages and overcommit which is beyond
+the scope reservations.  Even if a surplus page is allocated, the same
+reservation based adjustments as above will be made: SetPagePrivate(page) and
+resv_huge_pages--.
+
+After obtaining a new huge page, (page)->private is set to the value of
+the subpool associated with the page if it exists.  This will be used for
+subpool accounting when the page is freed.
+
+The routine vma_commit_reservation() is then called to adjust the reserve
+map based on the consumption of the reservation.  In general, this involves
+ensuring the page is represented within a file_region structure of the region
+map.  For shared mappings where the reservation was present, an entry
+in the reserve map already existed so no change is made.  However, if there
+was no reservation in a shared mapping or this was a private mapping a new
+entry must be created.
+
+It is possible that the reserve map could have been changed between the call
+to vma_needs_reservation() at the beginning of alloc_huge_page() and the
+call to vma_commit_reservation() after the page was allocated.  This would
+be possible if hugetlb_reserve_pages was called for the same page in a shared
+mapping.  In such cases, the reservation count and subpool free page count
+will be off by one.  This rare condition can be identified by comparing the
+return value from vma_needs_reservation and vma_commit_reservation.  If such
+a race is detected, the subpool and global reserve counts are adjusted to
+compensate.  See the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
+information on these routines.
+
+
+Instantiate Huge Pages
+======================
+
+After huge page allocation, the page is typically added to the page tables
+of the allocating task.  Before this, pages in a shared mapping are added
+to the page cache and pages in private mappings are added to an anonymous
+reverse mapping.  In both cases, the PagePrivate flag is cleared.  Therefore,
+when a huge page that has been instantiated is freed no adjustment is made
+to the global reservation count (resv_huge_pages).
+
+
+Freeing Huge Pages
+==================
+
+Huge page freeing is performed by the routine free_huge_page().  This routine
+is the destructor for hugetlbfs compound pages.  As a result, it is only
+passed a pointer to the page struct.  When a huge page is freed, reservation
+accounting may need to be performed.  This would be the case if the page was
+associated with a subpool that contained reserves, or the page is being freed
+on an error path where a global reserve count must be restored.
+
+The page->private field points to any subpool associated with the page.
+If the PagePrivate flag is set, it indicates the global reserve count should
+be adjusted (see the section
+:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
+for information on how these are set).
+
+The routine first calls hugepage_subpool_put_pages() for the page.  If this
+routine returns a value of 0 (which does not equal the value passed 1) it
+indicates reserves are associated with the subpool, and this newly free page
+must be used to keep the number of subpool reserves above the minimum size.
+Therefore, the global resv_huge_pages counter is incremented in this case.
+
+If the PagePrivate flag was set in the page, the global resv_huge_pages counter
+will always be incremented.
+
+.. _sub_pool_resv:
+
+Subpool Reservations
+====================
+
+There is a struct hstate associated with each huge page size.  The hstate
+tracks all huge pages of the specified size.  A subpool represents a subset
+of pages within a hstate that is associated with a mounted hugetlbfs
+filesystem.
+
+When a hugetlbfs filesystem is mounted a min_size option can be specified
+which indicates the minimum number of huge pages required by the filesystem.
+If this option is specified, the number of huge pages corresponding to
+min_size are reserved for use by the filesystem.  This number is tracked in
+the min_hpages field of a struct hugepage_subpool.  At mount time,
+hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
+huge pages.  If they can not be reserved, the mount fails.
+
+The routines hugepage_subpool_get/put_pages() are called when pages are
+obtained from or released back to a subpool.  They perform all subpool
+accounting, and track any reservations associated with the subpool.
+hugepage_subpool_get/put_pages are passed the number of huge pages by which
+to adjust the subpool 'used page' count (down for get, up for put).  Normally,
+they return the same value that was passed or an error if not enough pages
+exist in the subpool.
+
+However, if reserves are associated with the subpool a return value less
+than the passed value may be returned.  This return value indicates the
+number of additional global pool adjustments which must be made.  For example,
+suppose a subpool contains 3 reserved huge pages and someone asks for 5.
+The 3 reserved pages associated with the subpool can be used to satisfy part
+of the request.  But, 2 pages must be obtained from the global pools.  To
+relay this information to the caller, the value 2 is returned.  The caller
+is then responsible for attempting to obtain the additional two pages from
+the global pools.
+
+
+COW and Reservations
+====================
+
+Since shared mappings all point to and use the same underlying pages, the
+biggest reservation concern for COW is private mappings.  In this case,
+two tasks can be pointing at the same previously allocated page.  One task
+attempts to write to the page, so a new page must be allocated so that each
+task points to its own page.
+
+When the page was originally allocated, the reservation for that page was
+consumed.  When an attempt to allocate a new page is made as a result of
+COW, it is possible that no free huge pages are free and the allocation
+will fail.
+
+When the private mapping was originally created, the owner of the mapping
+was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
+map of the owner.  Since the owner created the mapping, the owner owns all
+the reservations associated with the mapping.  Therefore, when a write fault
+occurs and there is no page available, different action is taken for the owner
+and non-owner of the reservation.
+
+In the case where the faulting task is not the owner, the fault will fail and
+the task will typically receive a SIGBUS.
+
+If the owner is the faulting task, we want it to succeed since it owned the
+original reservation.  To accomplish this, the page is unmapped from the
+non-owning task.  In this way, the only reference is from the owning task.
+In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
+of the non-owning task.  The non-owning task may receive a SIGBUS if it later
+faults on a non-present page.  But, the original owner of the
+mapping/reservation will behave as expected.
+
+
+.. _resv_map_modifications:
+
+Reservation Map Modifications
+=============================
+
+The following low level routines are used to make modifications to a
+reservation map.  Typically, these routines are not called directly.  Rather,
+a reservation map helper routine is called which calls one of these low level
+routines.  These low level routines are fairly well documented in the source
+code (mm/hugetlb.c).  These routines are::
+
+	long region_chg(struct resv_map *resv, long f, long t);
+	long region_add(struct resv_map *resv, long f, long t);
+	void region_abort(struct resv_map *resv, long f, long t);
+	long region_count(struct resv_map *resv, long f, long t);
+
+Operations on the reservation map typically involve two operations:
+
+1) region_chg() is called to examine the reserve map and determine how
+   many pages in the specified range [f, t) are NOT currently represented.
+
+   The calling code performs global checks and allocations to determine if
+   there are enough huge pages for the operation to succeed.
+
+2)
+  a) If the operation can succeed, region_add() is called to actually modify
+     the reservation map for the same range [f, t) previously passed to
+     region_chg().
+  b) If the operation can not succeed, region_abort is called for the same
+     range [f, t) to abort the operation.
+
+Note that this is a two step process where region_add() and region_abort()
+are guaranteed to succeed after a prior call to region_chg() for the same
+range.  region_chg() is responsible for pre-allocating any data structures
+necessary to ensure the subsequent operations (specifically region_add()))
+will succeed.
+
+As mentioned above, region_chg() determines the number of pages in the range
+which are NOT currently represented in the map.  This number is returned to
+the caller.  region_add() returns the number of pages in the range added to
+the map.  In most cases, the return value of region_add() is the same as the
+return value of region_chg().  However, in the case of shared mappings it is
+possible for changes to the reservation map to be made between the calls to
+region_chg() and region_add().  In this case, the return value of region_add()
+will not match the return value of region_chg().  It is likely that in such
+cases global counts and subpool accounting will be incorrect and in need of
+adjustment.  It is the responsibility of the caller to check for this condition
+and make the appropriate adjustments.
+
+The routine region_del() is called to remove regions from a reservation map.
+It is typically called in the following situations:
+
+- When a file in the hugetlbfs filesystem is being removed, the inode will
+  be released and the reservation map freed.  Before freeing the reservation
+  map, all the individual file_region structures must be freed.  In this case
+  region_del is passed the range [0, LONG_MAX).
+- When a hugetlbfs file is being truncated.  In this case, all allocated pages
+  after the new file size must be freed.  In addition, any file_region entries
+  in the reservation map past the new end of file must be deleted.  In this
+  case, region_del is passed the range [new_end_of_file, LONG_MAX).
+- When a hole is being punched in a hugetlbfs file.  In this case, huge pages
+  are removed from the middle of the file one at a time.  As the pages are
+  removed, region_del() is called to remove the corresponding entry from the
+  reservation map.  In this case, region_del is passed the range
+  [page_idx, page_idx + 1).
+
+In every case, region_del() will return the number of pages removed from the
+reservation map.  In VERY rare cases, region_del() can fail.  This can only
+happen in the hole punch case where it has to split an existing file_region
+entry and can not allocate a new structure.  In this error case, region_del()
+will return -ENOMEM.  The problem here is that the reservation map will
+indicate that there is a reservation for the page.  However, the subpool and
+global reservation counts will not reflect the reservation.  To handle this
+situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
+counters so that they correspond with the reservation map entry that could
+not be deleted.
+
+region_count() is called when unmapping a private huge page mapping.  In
+private mappings, the lack of a entry in the reservation map indicates that
+a reservation exists.  Therefore, by counting the number of entries in the
+reservation map we know how many reservations were consumed and how many are
+outstanding (outstanding = (end - start) - region_count(resv, start, end)).
+Since the mapping is going away, the subpool and global reservation counts
+are decremented by the number of outstanding reservations.
+
+.. _resv_map_helpers:
+
+Reservation Map Helper Routines
+===============================
+
+Several helper routines exist to query and modify the reservation maps.
+These routines are only interested with reservations for a specific huge
+page, so they just pass in an address instead of a range.  In addition,
+they pass in the associated VMA.  From the VMA, the type of mapping (private
+or shared) and the location of the reservation map (inode or VMA) can be
+determined.  These routines simply call the underlying routines described
+in the section "Reservation Map Modifications".  However, they do take into
+account the 'opposite' meaning of reservation map entries for private and
+shared mappings and hide this detail from the caller::
+
+	long vma_needs_reservation(struct hstate *h,
+				   struct vm_area_struct *vma,
+				   unsigned long addr)
+
+This routine calls region_chg() for the specified page.  If no reservation
+exists, 1 is returned.  If a reservation exists, 0 is returned::
+
+	long vma_commit_reservation(struct hstate *h,
+				    struct vm_area_struct *vma,
+				    unsigned long addr)
+
+This calls region_add() for the specified page.  As in the case of region_chg
+and region_add, this routine is to be called after a previous call to
+vma_needs_reservation.  It will add a reservation entry for the page.  It
+returns 1 if the reservation was added and 0 if not.  The return value should
+be compared with the return value of the previous call to
+vma_needs_reservation.  An unexpected difference indicates the reservation
+map was modified between calls::
+
+	void vma_end_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+This calls region_abort() for the specified page.  As in the case of region_chg
+and region_abort, this routine is to be called after a previous call to
+vma_needs_reservation.  It will abort/end the in progress reservation add
+operation::
+
+	long vma_add_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+This is a special wrapper routine to help facilitate reservation cleanup
+on error paths.  It is only called from the routine restore_reserve_on_error().
+This routine is used in conjunction with vma_needs_reservation in an attempt
+to add a reservation to the reservation map.  It takes into account the
+different reservation map semantics for private and shared mappings.  Hence,
+region_add is called for shared mappings (as an entry present in the map
+indicates a reservation), and region_del is called for private mappings (as
+the absence of an entry in the map indicates a reservation).  See the section
+"Reservation cleanup in error paths" for more information on what needs to
+be done on error paths.
+
+
+Reservation Cleanup in Error Paths
+==================================
+
+As mentioned in the section
+:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
+map modifications are performed in two steps.  First vma_needs_reservation
+is called before a page is allocated.  If the allocation is successful,
+then vma_commit_reservation is called.  If not, vma_end_reservation is called.
+Global and subpool reservation counts are adjusted based on success or failure
+of the operation and all is well.
+
+Additionally, after a huge page is instantiated the PagePrivate flag is
+cleared so that accounting when the page is ultimately freed is correct.
+
+However, there are several instances where errors are encountered after a huge
+page is allocated but before it is instantiated.  In this case, the page
+allocation has consumed the reservation and made the appropriate subpool,
+reservation map and global count adjustments.  If the page is freed at this
+time (before instantiation and clearing of PagePrivate), then free_huge_page
+will increment the global reservation count.  However, the reservation map
+indicates the reservation was consumed.  This resulting inconsistent state
+will cause the 'leak' of a reserved huge page.  The global reserve count will
+be  higher than it should and prevent allocation of a pre-allocated page.
+
+The routine restore_reserve_on_error() attempts to handle this situation.  It
+is fairly well documented.  The intention of this routine is to restore
+the reservation map to the way it was before the page allocation.   In this
+way, the state of the reservation map will correspond to the global reservation
+count after the page is freed.
+
+The routine restore_reserve_on_error itself may encounter errors while
+attempting to restore the reservation map entry.  In this case, it will
+simply clear the PagePrivate flag of the page.  In this way, the global
+reserve count will not be incremented when the page is freed.  However, the
+reservation map will continue to look as though the reservation was consumed.
+A page can still be allocated for the address, but it will not use a reserved
+page as originally intended.
+
+There is some code (most notably userfaultfd) which can not call
+restore_reserve_on_error.  In this case, it simply modifies the PagePrivate
+so that a reservation will not be leaked when the huge page is freed.
+
+
+Reservations and Memory Policy
+==============================
+Per-node huge page lists existed in struct hstate when git was first used
+to manage Linux code.  The concept of reservations was added some time later.
+When reservations were added, no attempt was made to take memory policy
+into account.  While cpusets are not exactly the same as memory policy, this
+comment in hugetlb_acct_memory sums up the interaction between reservations
+and cpusets/memory policy::
+
+	/*
+	 * When cpuset is configured, it breaks the strict hugetlb page
+	 * reservation as the accounting is done on a global variable. Such
+	 * reservation is completely rubbish in the presence of cpuset because
+	 * the reservation is not checked against page availability for the
+	 * current cpuset. Application can still potentially OOM'ed by kernel
+	 * with lack of free htlb page in cpuset that the task is in.
+	 * Attempt to enforce strict accounting with cpuset is almost
+	 * impossible (or too ugly) because cpuset is too fluid that
+	 * task or memory node can be dynamically moved between cpusets.
+	 *
+	 * The change of semantics for shared hugetlb mapping with cpuset is
+	 * undesirable. However, in order to preserve some of the semantics,
+	 * we fall back to check against current free page availability as
+	 * a best attempt and hopefully to minimize the impact of changing
+	 * semantics that cpuset has.
+	 */
+
+Huge page reservations were added to prevent unexpected page allocation
+failures (OOM) at page fault time.  However, if an application makes use
+of cpusets or memory policy there is no guarantee that huge pages will be
+available on the required nodes.  This is true even if there are a sufficient
+number of global reservations.
+
+Hugetlbfs regression testing
+============================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions.  In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
+
+--
+Mike Kravetz, 7 April 2017
diff --git a/Documentation/mm/hwpoison.rst b/Documentation/mm/hwpoison.rst
new file mode 100644
index 000000000000..b9d5253c1305
--- /dev/null
+++ b/Documentation/mm/hwpoison.rst
@@ -0,0 +1,184 @@
+.. hwpoison:
+
+========
+hwpoison
+========
+
+What is hwpoison?
+=================
+
+Upcoming Intel CPUs have support for recovering from some memory errors
+(``MCA recovery``). This requires the OS to declare a page "poisoned",
+kill the processes associated with it and avoid using it in the future.
+
+This patchkit implements the necessary infrastructure in the VM.
+
+To quote the overview comment::
+
+	High level machine check handler. Handles pages reported by the
+	hardware as being corrupted usually due to a 2bit ECC memory or cache
+	failure.
+
+	This focusses on pages detected as corrupted in the background.
+	When the current CPU tries to consume corruption the currently
+	running process can just be killed directly instead. This implies
+	that if the error cannot be handled for some reason it's safe to
+	just ignore it because no corruption has been consumed yet. Instead
+	when that happens another machine check will happen.
+
+	Handles page cache pages in various states. The tricky part
+	here is that we can access any page asynchronous to other VM
+	users, because memory failures could happen anytime and anywhere,
+	possibly violating some of their assumptions. This is why this code
+	has to be extremely careful. Generally it tries to use normal locking
+	rules, as in get the standard locks, even if that means the
+	error handling takes potentially a long time.
+
+	Some of the operations here are somewhat inefficient and have non
+	linear algorithmic complexity, because the data structures have not
+	been optimized for this case. This is in particular the case
+	for the mapping from a vma to a process. Since this case is expected
+	to be rare we hope we can get away with this.
+
+The code consists of a the high level handler in mm/memory-failure.c,
+a new page poison bit and various checks in the VM to handle poisoned
+pages.
+
+The main target right now is KVM guests, but it works for all kinds
+of applications. KVM support requires a recent qemu-kvm release.
+
+For the KVM use there was need for a new signal type so that
+KVM can inject the machine check into the guest with the proper
+address. This in theory allows other applications to handle
+memory failures too. The expection is that near all applications
+won't do that, but some very specialized ones might.
+
+Failure recovery modes
+======================
+
+There are two (actually three) modes memory failure recovery can be in:
+
+vm.memory_failure_recovery sysctl set to zero:
+	All memory failures cause a panic. Do not attempt recovery.
+
+early kill
+	(can be controlled globally and per process)
+	Send SIGBUS to the application as soon as the error is detected
+	This allows applications who can process memory errors in a gentle
+	way (e.g. drop affected object)
+	This is the mode used by KVM qemu.
+
+late kill
+	Send SIGBUS when the application runs into the corrupted page.
+	This is best for memory error unaware applications and default
+	Note some pages are always handled as late kill.
+
+User control
+============
+
+vm.memory_failure_recovery
+	See sysctl.txt
+
+vm.memory_failure_early_kill
+	Enable early kill mode globally
+
+PR_MCE_KILL
+	Set early/late kill mode/revert to system default
+
+	arg1: PR_MCE_KILL_CLEAR:
+		Revert to system default
+	arg1: PR_MCE_KILL_SET:
+		arg2 defines thread specific mode
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			Use system global default
+
+	Note that if you want to have a dedicated thread which handles
+	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
+	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
+	the SIGBUS is sent to the main thread.
+
+PR_MCE_KILL_GET
+	return current mode
+
+Testing
+=======
+
+* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
+  process for testing
+
+* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
+
+  corrupt-pfn
+	Inject hwpoison fault at PFN echoed into this file. This does
+	some early filtering to avoid corrupted unintended pages in test suites.
+
+  unpoison-pfn
+	Software-unpoison page at PFN echoed into this file. This way
+	a page can be reused again.  This only works for Linux
+	injected failures, not for real memory failures. Once any hardware
+	memory failure happens, this feature is disabled.
+
+  Note these injection interfaces are not stable and might change between
+  kernel versions
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	Only handle memory failures to pages associated with the file
+	system defined by block device major/minor.  -1U is the
+	wildcard value.  This should be only used for testing with
+	artificial injection.
+
+  corrupt-filter-memcg
+	Limit injection to pages owned by memgroup. Specified by inode
+	number of the memcg.
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+	        usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	When specified, only poison pages if ((page_flags & mask) ==
+	value).  This allows stress testing of many kinds of
+	pages. The page_flags are the same as in /proc/kpageflags. The
+	flag bits are defined in include/linux/kernel-page-flags.h and
+	documented in Documentation/admin-guide/mm/pagemap.rst
+
+* Architecture specific MCE injector
+
+  x86 has mce-inject, mce-test
+
+  Some portable hwpoison test programs in mce-test, see below.
+
+References
+==========
+
+http://halobates.de/mce-lc09-2.pdf
+	Overview presentation from LinuxCon 09
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	Test suite (hwpoison specific portable tests in tsrc)
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86 specific injector
+
+
+Limitations
+===========
+- Not all page types are supported and never will. Most kernel internal
+  objects cannot be recovered, only LRU pages for now.
+
+---
+Andi Kleen, Oct 2009
diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
new file mode 100644
index 000000000000..575ccd40e30c
--- /dev/null
+++ b/Documentation/mm/index.rst
@@ -0,0 +1,68 @@
+=====================================
+Linux Memory Management Documentation
+=====================================
+
+Memory Management Guide
+=======================
+
+This is a guide to understanding the memory management subsystem
+of Linux.  If you are looking for advice on simply allocating memory,
+see the :ref:`memory_allocation`.  For controlling and tuning guides,
+see the :doc:`admin guide <../admin-guide/mm/index>`.
+
+.. toctree::
+   :maxdepth: 1
+
+   physical_memory
+   page_tables
+   process_addrs
+   bootmem
+   page_allocation
+   vmalloc
+   slab
+   highmem
+   page_reclaim
+   swap
+   page_cache
+   shmfs
+   oom
+
+Legacy Documentation
+====================
+
+This is a collection of older documents about the Linux memory management
+(MM) subsystem internals with different level of details ranging from
+notes and mailing list responses for elaborating descriptions of data
+structures and algorithms.  It should all be integrated nicely into the
+above structured documentation, or deleted if it has served its purpose.
+
+.. toctree::
+   :maxdepth: 1
+
+   active_mm
+   arch_pgtable_helpers
+   balance
+   damon/index
+   free_page_reporting
+   frontswap
+   hmm
+   hwpoison
+   hugetlbfs_reserv
+   ksm
+   memory-model
+   mmu_notifier
+   numa
+   overcommit-accounting
+   page_migration
+   page_frags
+   page_owner
+   page_table_check
+   remap_file_pages
+   slub
+   split_page_table_lock
+   transhuge
+   unevictable-lru
+   vmalloced-kernel-stacks
+   vmemmap_dedup
+   z3fold
+   zsmalloc
diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst
new file mode 100644
index 000000000000..9e37add068e6
--- /dev/null
+++ b/Documentation/mm/ksm.rst
@@ -0,0 +1,87 @@
+.. _ksm:
+
+=======================
+Kernel Samepage Merging
+=======================
+
+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
+added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
+and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/
+
+The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
+
+Design
+======
+
+Overview
+--------
+
+.. kernel-doc:: mm/ksm.c
+   :DOC: Overview
+
+Reverse mapping
+---------------
+KSM maintains reverse mapping information for KSM pages in the stable
+tree.
+
+If a KSM page is shared between less than ``max_page_sharing`` VMAs,
+the node of the stable tree that represents such KSM page points to a
+list of struct rmap_item and the ``page->mapping`` of the
+KSM page points to the stable tree node.
+
+When the sharing passes this threshold, KSM adds a second dimension to
+the stable tree. The tree node becomes a "chain" that links one or
+more "dups". Each "dup" keeps reverse mapping information for a KSM
+page with ``page->mapping`` pointing to that "dup".
+
+Every "chain" and all "dups" linked into a "chain" enforce the
+invariant that they represent the same write protected memory content,
+even if each "dup" will be pointed by a different KSM page copy of
+that content.
+
+This way the stable tree lookup computational complexity is unaffected
+if compared to an unlimited list of reverse mappings. It is still
+enforced that there cannot be KSM page content duplicates in the
+stable tree itself.
+
+The deduplication limit enforced by ``max_page_sharing`` is required
+to avoid the virtual memory rmap lists to grow too large. The rmap
+walk has O(N) complexity where N is the number of rmap_items
+(i.e. virtual mappings) that are sharing the page, which is in turn
+capped by ``max_page_sharing``. So this effectively spreads the linear
+O(N) computational complexity from rmap walk context over different
+KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
+but N is the number of stable_node "dups", not the number of
+rmap_items, so it has not a significant impact on ksmd performance. In
+practice the best stable_node "dup" candidate will be kept and found
+at the head of the "dups" list.
+
+High values of ``max_page_sharing`` result in faster memory merging
+(because there will be fewer stable_node dups queued into the
+stable_node chain->hlist to check for pruning) and higher
+deduplication factor at the expense of slower worst case for rmap
+walks for any KSM page which can happen during swapping, compaction,
+NUMA balancing and page migration.
+
+The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
+``max_page_sharing`` tunable, and an high ratio may indicate fragmentation
+in the stable_node dups, which could be solved by introducing
+fragmentation algorithms in ksmd which would refile rmap_items from
+one stable_node dup to another stable_node dup, in order to free up
+stable_node "dups" with few rmap_items in them, but that may increase
+the ksmd CPU usage and possibly slowdown the readonly computations on
+the KSM pages of the applications.
+
+The whole list of stable_node "dups" linked in the stable_node
+"chains" is scanned periodically in order to prune stale stable_nodes.
+The frequency of such scans is defined by
+``stable_node_chains_prune_millisecs`` sysfs tunable.
+
+Reference
+---------
+.. kernel-doc:: mm/ksm.c
+   :functions: mm_slot ksm_scan stable_node rmap_item
+
+--
+Izik Eidus,
+Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/mm/memory-model.rst b/Documentation/mm/memory-model.rst
new file mode 100644
index 000000000000..3779e562dc76
--- /dev/null
+++ b/Documentation/mm/memory-model.rst
@@ -0,0 +1,177 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _physical_memory_model:
+
+=====================
+Physical Memory Model
+=====================
+
+Physical memory in a system may be addressed in different ways. The
+simplest case is when the physical memory starts at address 0 and
+spans a contiguous range up to the maximal address. It could be,
+however, that this range contains small holes that are not accessible
+for the CPU. Then there could be several contiguous ranges at
+completely distinct addresses. And, don't forget about NUMA, where
+different memory banks are attached to different CPUs.
+
+Linux abstracts this diversity using one of the two memory models:
+FLATMEM and SPARSEMEM. Each architecture defines what
+memory models it supports, what the default memory model is and
+whether it is possible to manually override that default.
+
+All the memory models track the status of physical page frames using
+struct page arranged in one or more arrays.
+
+Regardless of the selected memory model, there exists one-to-one
+mapping between the physical page frame number (PFN) and the
+corresponding `struct page`.
+
+Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn`
+helpers that allow the conversion from PFN to `struct page` and vice
+versa.
+
+FLATMEM
+=======
+
+The simplest memory model is FLATMEM. This model is suitable for
+non-NUMA systems with contiguous, or mostly contiguous, physical
+memory.
+
+In the FLATMEM memory model, there is a global `mem_map` array that
+maps the entire physical memory. For most architectures, the holes
+have entries in the `mem_map` array. The `struct page` objects
+corresponding to the holes are never fully initialized.
+
+To allocate the `mem_map` array, architecture specific setup code should
+call :c:func:`free_area_init` function. Yet, the mappings array is not
+usable until the call to :c:func:`memblock_free_all` that hands all the
+memory to the page allocator.
+
+An architecture may free parts of the `mem_map` array that do not cover the
+actual physical pages. In such case, the architecture specific
+:c:func:`pfn_valid` implementation should take the holes in the
+`mem_map` into account.
+
+With FLATMEM, the conversion between a PFN and the `struct page` is
+straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the
+`mem_map` array.
+
+The `ARCH_PFN_OFFSET` defines the first page frame number for
+systems with physical memory starting at address different from 0.
+
+SPARSEMEM
+=========
+
+SPARSEMEM is the most versatile memory model available in Linux and it
+is the only memory model that supports several advanced features such
+as hot-plug and hot-remove of the physical memory, alternative memory
+maps for non-volatile memory devices and deferred initialization of
+the memory map for larger systems.
+
+The SPARSEMEM model presents the physical memory as a collection of
+sections. A section is represented with struct mem_section
+that contains `section_mem_map` that is, logically, a pointer to an
+array of struct pages. However, it is stored with some other magic
+that aids the sections management. The section size and maximal number
+of section is specified using `SECTION_SIZE_BITS` and
+`MAX_PHYSMEM_BITS` constants defined by each architecture that
+supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a
+physical address that an architecture supports, the
+`SECTION_SIZE_BITS` is an arbitrary value.
+
+The maximal number of sections is denoted `NR_MEM_SECTIONS` and
+defined as
+
+.. math::
+
+   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
+
+The `mem_section` objects are arranged in a two-dimensional array
+called `mem_sections`. The size and placement of this array depend
+on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of
+sections:
+
+* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections`
+  array is static and has `NR_MEM_SECTIONS` rows. Each row holds a
+  single `mem_section` object.
+* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections`
+  array is dynamically allocated. Each row contains PAGE_SIZE worth of
+  `mem_section` objects and the number of rows is calculated to fit
+  all the memory sections.
+
+The architecture setup code should call sparse_init() to
+initialize the memory sections and the memory maps.
+
+With SPARSEMEM there are two possible ways to convert a PFN to the
+corresponding `struct page` - a "classic sparse" and "sparse
+vmemmap". The selection is made at build time and it is determined by
+the value of `CONFIG_SPARSEMEM_VMEMMAP`.
+
+The classic sparse encodes the section number of a page in page->flags
+and uses high bits of a PFN to access the section that maps that page
+frame. Inside a section, the PFN is the index to the array of pages.
+
+The sparse vmemmap uses a virtually mapped memory map to optimize
+pfn_to_page and page_to_pfn operations. There is a global `struct
+page *vmemmap` pointer that points to a virtually contiguous array of
+`struct page` objects. A PFN is an index to that array and the
+offset of the `struct page` from `vmemmap` is the PFN of that
+page.
+
+To use vmemmap, an architecture has to reserve a range of virtual
+addresses that will map the physical pages containing the memory
+map and make sure that `vmemmap` points to that range. In addition,
+the architecture should implement :c:func:`vmemmap_populate` method
+that will allocate the physical memory and create page tables for the
+virtual memory map. If an architecture does not have any special
+requirements for the vmemmap mappings, it can use default
+:c:func:`vmemmap_populate_basepages` provided by the generic memory
+management.
+
+The virtually mapped memory map allows storing `struct page` objects
+for persistent memory devices in pre-allocated storage on those
+devices. This storage is represented with struct vmem_altmap
+that is eventually passed to vmemmap_populate() through a long chain
+of function calls. The vmemmap_populate() implementation may use the
+`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to
+allocate memory map on the persistent memory device.
+
+ZONE_DEVICE
+===========
+The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer
+`struct page` `mem_map` services for device driver identified physical
+address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact
+that the page objects for these address ranges are never marked online,
+and that a reference must be taken against the device, not just the page
+to keep the memory pinned for active use. `ZONE_DEVICE`, via
+:c:func:`devm_memremap_pages`, performs just enough memory hotplug to
+turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and
+:c:func:`get_user_pages` service for the given range of pfns. Since the
+page reference count never drops below 1 the page is never tracked as
+free memory and the page's `struct list_head lru` space is repurposed
+for back referencing to the host device / driver that mapped the memory.
+
+While `SPARSEMEM` presents memory as a collection of sections,
+optionally collected into memory blocks, `ZONE_DEVICE` users have a need
+for smaller granularity of populating the `mem_map`. Given that
+`ZONE_DEVICE` memory is never marked online it is subsequently never
+subject to its memory ranges being exposed through the sysfs memory
+hotplug api on memory block boundaries. The implementation relies on
+this lack of user-api constraint to allow sub-section sized memory
+ranges to be specified to :c:func:`arch_add_memory`, the top-half of
+memory hotplug. Sub-section support allows for 2MB as the cross-arch
+common alignment granularity for :c:func:`devm_memremap_pages`.
+
+The users of `ZONE_DEVICE` are:
+
+* pmem: Map platform persistent memory to be used as a direct-I/O target
+  via DAX mappings.
+
+* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
+  event callbacks to allow a device-driver to coordinate memory management
+  events related to device-memory, typically GPU memory. See
+  Documentation/mm/hmm.rst.
+
+* p2pdma: Create `struct page` objects to allow peer devices in a
+  PCI/-E topology to coordinate direct-DMA operations between themselves,
+  i.e. bypass host memory.
diff --git a/Documentation/mm/mmu_notifier.rst b/Documentation/mm/mmu_notifier.rst
new file mode 100644
index 000000000000..df5d7777fc6b
--- /dev/null
+++ b/Documentation/mm/mmu_notifier.rst
@@ -0,0 +1,99 @@
+.. _mmu_notifier:
+
+When do you need to notify inside page table lock ?
+===================================================
+
+When clearing a pte/pmd we are given a choice to notify the event through
+(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
+the page table lock. But that notification is not necessary in all cases.
+
+For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
+thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
+process virtual address space). There is only 2 cases when you need to notify
+those secondary TLB while holding page table lock when clearing a pte/pmd:
+
+  A) page backing address is free before mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious you do not want to take the risk for the device to write to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break memory model like C11 or C++11 for
+the device.
+
+Consider the following scenario (device use a feature similar to ATS/PASID):
+
+Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume
+they are write protected for COW (other case of B apply too).
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {try to write to addrA}
+ CPU-thread-1  {try to write to addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA and populate device TLB}
+ DEV-thread-2  {read addrB and populate device TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+ CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {write to addrA which is a write to new page}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {write to addrB which is a write to new page}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {read addrA from old page}
+ DEV-thread-2  {read addrB from new page}
+
+So here because at time N+2 the clear page table entry was not pair with a
+notification to invalidate the secondary TLB, the device see the new value for
+addrB before seeing the new value for addrA. This break total memory ordering
+for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
+call to mmu_notifier_invalidate_range_end() outside the page table lock. This
+is true even if the thread doing the page table update is preempted right after
+releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/mm/numa.rst b/Documentation/mm/numa.rst
new file mode 100644
index 000000000000..99fdeca917ca
--- /dev/null
+++ b/Documentation/mm/numa.rst
@@ -0,0 +1,150 @@
+.. _numa:
+
+Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
+
+=============
+What is NUMA?
+=============
+
+This question can be answered from a couple of perspectives:  the
+hardware view and the Linux software view.
+
+From the hardware perspective, a NUMA system is a computer platform that
+comprises multiple components or assemblies each of which may contain 0
+or more CPUs, local memory, and/or IO buses.  For brevity and to
+disambiguate the hardware view of these physical components/assemblies
+from the software abstraction thereof, we'll call the components/assemblies
+'cells' in this document.
+
+Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
+of the system--although some components necessary for a stand-alone SMP system
+may not be populated on any given cell.   The cells of the NUMA system are
+connected together with some sort of system interconnect--e.g., a crossbar or
+point-to-point link are common types of NUMA system interconnects.  Both of
+these types of interconnects can be aggregated to create NUMA platforms with
+cells at multiple distances from other cells.
+
+For Linux, the NUMA platforms of interest are primarily what is known as Cache
+Coherent NUMA or ccNUMA systems.   With ccNUMA systems, all memory is visible
+to and accessible from any CPU attached to any cell and cache coherency
+is handled in hardware by the processor caches and/or the system interconnect.
+
+Memory access time and effective memory bandwidth varies depending on how far
+away the cell containing the CPU or IO bus making the memory access is from the
+cell containing the target memory.  For example, access to memory by CPUs
+attached to the same cell will experience faster access times and higher
+bandwidths than accesses to memory on other, remote cells.  NUMA platforms
+can have cells at multiple remote distances from any given cell.
+
+Platform vendors don't build NUMA systems just to make software developers'
+lives interesting.  Rather, this architecture is a means to provide scalable
+memory bandwidth.  However, to achieve scalable memory bandwidth, system and
+application software must arrange for a large majority of the memory references
+[cache misses] to be to "local" memory--memory on the same cell, if any--or
+to the closest cell with memory.
+
+This leads to the Linux software view of a NUMA system:
+
+Linux divides the system's hardware resources into multiple software
+abstractions called "nodes".  Linux maps the nodes onto the physical cells
+of the hardware platform, abstracting away some of the details for some
+architectures.  As with physical cells, software nodes may contain 0 or more
+CPUs, memory and/or IO buses.  And, again, memory accesses to memory on
+"closer" nodes--nodes that map to closer cells--will generally experience
+faster access times and higher effective bandwidth than accesses to more
+remote cells.
+
+For some architectures, such as x86, Linux will "hide" any node representing a
+physical cell that has no memory attached, and reassign any CPUs attached to
+that cell to a node representing a cell that does have memory.  Thus, on
+these architectures, one cannot assume that all CPUs that Linux associates with
+a given node will see the same local memory access times and bandwidth.
+
+In addition, for some architectures, again x86 is an example, Linux supports
+the emulation of additional nodes.  For NUMA emulation, linux will carve up
+the existing nodes--or the system memory for non-NUMA platforms--into multiple
+nodes.  Each emulated node will manage a fraction of the underlying cells'
+physical memory.  NUMA emluation is useful for testing NUMA kernel and
+application features on non-NUMA platforms, and as a sort of memory resource
+management mechanism when used together with cpusets.
+[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+For each node with memory, Linux constructs an independent memory management
+subsystem, complete with its own free page lists, in-use page lists, usage
+statistics and locks to mediate access.  In addition, Linux constructs for
+each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
+an ordered "zonelist".  A zonelist specifies the zones/nodes to visit when a
+selected zone/node cannot satisfy the allocation request.  This situation,
+when a zone has no available memory to satisfy a request, is called
+"overflow" or "fallback".
+
+Because some nodes contain multiple zones containing different types of
+memory, Linux must decide whether to order the zonelists such that allocations
+fall back to the same zone type on a different node, or to a different zone
+type on the same node.  This is an important consideration because some zones,
+such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
+a default Node ordered zonelist. This means it tries to fallback to other zones
+from the same node before using remote nodes which are ordered by NUMA distance.
+
+By default, Linux will attempt to satisfy memory allocation requests from the
+node to which the CPU that executes the request is assigned.  Specifically,
+Linux will attempt to allocate from the first node in the appropriate zonelist
+for the node where the request originates.  This is called "local allocation."
+If the "local" node cannot satisfy the request, the kernel will examine other
+nodes' zones in the selected zonelist looking for the first zone in the list
+that can satisfy the request.
+
+Local allocation will tend to keep subsequent access to the allocated memory
+"local" to the underlying physical resources and off the system interconnect--
+as long as the task on whose behalf the kernel allocated some memory does not
+later migrate away from that memory.  The Linux scheduler is aware of the
+NUMA topology of the platform--embodied in the "scheduling domains" data
+structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler
+attempts to minimize task migration to distant scheduling domains.  However,
+the scheduler does not take a task's NUMA footprint into account directly.
+Thus, under sufficient imbalance, tasks can migrate between nodes, remote
+from their initial node and kernel data structures.
+
+System administrators and application designers can restrict a task's migration
+to improve NUMA locality using various CPU affinity command line interfaces,
+such as taskset(1) and numactl(1), and program interfaces such as
+sched_setaffinity(2).  Further, one can modify the kernel's default local
+allocation behavior using Linux NUMA memory policy. [see
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
+
+System administrators can restrict the CPUs and nodes' memories that a non-
+privileged user can specify in the scheduling or NUMA commands and functions
+using control groups and CPUsets.  [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+On architectures that do not hide memoryless nodes, Linux will include only
+zones [nodes] with memory in the zonelists.  This means that for a memoryless
+node the "local memory node"--the node of the first zone in CPU's node's
+zonelist--will not be the node itself.  Rather, it will be the node that the
+kernel selected as the nearest node with memory when it built the zonelists.
+So, default, local allocations will succeed with the kernel supplying the
+closest available memory.  This is a consequence of the same mechanism that
+allows such allocations to fallback to other nearby nodes when a node that
+does contain memory overflows.
+
+Some kernel allocations do not want or cannot tolerate this allocation fallback
+behavior.  Rather they want to be sure they get memory from the specified node
+or get notified that the node has no free memory.  This is usually the case when
+a subsystem allocates per CPU memory resources, for example.
+
+A typical model for making such an allocation is to obtain the node id of the
+node to which the "current CPU" is attached using one of the kernel's
+numa_node_id() or CPU_to_node() functions and then request memory from only
+the node id returned.  When such an allocation fails, the requesting subsystem
+may revert to its own fallback path.  The slab kernel memory allocator is an
+example of this.  Or, the subsystem may choose to disable or not to enable
+itself on allocation failure.  The kernel profiling subsystem is an example of
+this.
+
+If the architecture supports--does not hide--memoryless nodes, then CPUs
+attached to memoryless nodes would always incur the fallback path overhead
+or some subsystems would fail to initialize if they attempted to allocated
+memory exclusively from a node without memory.  To support such
+architectures transparently, kernel subsystems can use the numa_mem_id()
+or cpu_to_mem() function to locate the "local memory node" for the calling or
+specified CPU.  Again, this is the same node from which default, local page
+allocations will be attempted.
diff --git a/Documentation/mm/oom.rst b/Documentation/mm/oom.rst
new file mode 100644
index 000000000000..18e9e40c1ec1
--- /dev/null
+++ b/Documentation/mm/oom.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+Out Of Memory Handling
+======================
diff --git a/Documentation/mm/overcommit-accounting.rst b/Documentation/mm/overcommit-accounting.rst
new file mode 100644
index 000000000000..1addb0c374a4
--- /dev/null
+++ b/Documentation/mm/overcommit-accounting.rst
@@ -0,0 +1,88 @@
+.. _overcommit_accounting:
+
+=====================
+Overcommit Accounting
+=====================
+
+The Linux kernel supports the following overcommit handling modes
+
+0
+	Heuristic overcommit handling. Obvious overcommits of address
+	space are refused. Used for a typical system. It ensures a
+	seriously wild allocation fails while allowing overcommit to
+	reduce swap usage.  root is allowed to allocate slightly more
+	memory in this mode. This is the default.
+
+1
+	Always overcommit. Appropriate for some scientific
+	applications. Classic example is code using sparse arrays and
+	just relying on the virtual memory consisting almost entirely
+	of zero pages.
+
+2
+	Don't overcommit. The total address space commit for the
+	system is not permitted to exceed swap + a configurable amount
+	(default is 50%) of physical RAM.  Depending on the amount you
+	use, in most situations this means a process will not be
+	killed while accessing pages but will receive errors on memory
+	allocation as appropriate.
+
+	Useful for applications that want to guarantee their memory
+	allocations will be available in the future without having to
+	initialize every page.
+
+The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
+
+The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
+or ``vm.overcommit_kbytes`` (absolute value). These only have an effect
+when ``vm.overcommit_memory`` is set to 2.
+
+The current overcommit limit and amount committed are viewable in
+``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
+
+Gotchas
+=======
+
+The C language stack growth does an implicit mremap. If you want absolute
+guarantees and run close to the edge you MUST mmap your stack for the
+largest size you think you will need. For typical stack usage this does
+not matter much but it's a corner case if you really really care
+
+In mode 2 the MAP_NORESERVE flag is ignored.
+
+
+How It Works
+============
+
+The overcommit is based on the following rules
+
+For a file backed map
+	| SHARED or READ-only	-	0 cost (the file is the map not swap)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+For an anonymous or ``/dev/zero`` map
+	| SHARED			-	size of mapping
+	| PRIVATE READ-only	-	0 cost (but of little use)
+	| PRIVATE WRITABLE	-	size of mapping per instance
+
+Additional accounting
+	| Pages made writable copies by mmap
+	| shmfs memory drawn from the same pool
+
+Status
+======
+
+*	We account mmap memory mappings
+*	We account mprotect changes in commit
+*	We account mremap changes in size
+*	We account brk
+*	We account munmap
+*	We report the commit status in /proc
+*	Account and check on fork
+*	Review stack handling/building on exec
+*	SHMfs accounting
+*	Implement actual limit enforcement
+
+To Do
+=====
+*	Account ptrace pages (this is hard)
diff --git a/Documentation/mm/page_allocation.rst b/Documentation/mm/page_allocation.rst
new file mode 100644
index 000000000000..d9b4495561f1
--- /dev/null
+++ b/Documentation/mm/page_allocation.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Page Allocation
+===============
diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst
new file mode 100644
index 000000000000..75eba7c431b2
--- /dev/null
+++ b/Documentation/mm/page_cache.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Page Cache
+==========
diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst
new file mode 100644
index 000000000000..7d6f9385d129
--- /dev/null
+++ b/Documentation/mm/page_frags.rst
@@ -0,0 +1,45 @@
+.. _page_frags:
+
+==============
+Page fragments
+==============
+
+A page fragment is an arbitrary-length arbitrary-offset area of memory
+which resides within a 0 or higher order compound page.  Multiple
+fragments within that page are individually refcounted, in the page's
+reference counter.
+
+The page_frag functions, page_frag_alloc and page_frag_free, provide a
+simple allocation framework for page fragments.  This is used by the
+network stack and network device drivers to provide a backing region of
+memory for use as either an sk_buff->head, or to be used in the "frags"
+portion of skb_shared_info.
+
+In order to make use of the page fragment APIs a backing page fragment
+cache is needed.  This provides a central point for the fragment allocation
+and tracks allows multiple calls to make use of a cached page.  The
+advantage to doing this is that multiple calls to get_page can be avoided
+which can be expensive at allocation time.  However due to the nature of
+this caching it is required that any calls to the cache be protected by
+either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
+to be disabled when executing the fragment allocation.
+
+The network stack uses two separate caches per CPU to handle fragment
+allocation.  The netdev_alloc_cache is used by callers making use of the
+netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
+used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
+main difference between these two calls is the context in which they may be
+called.  The "netdev" prefixed functions are usable in any context as these
+functions will disable interrupts, while the "napi" prefixed functions are
+only usable within the softirq context.
+
+Many network device drivers use a similar methodology for allocating page
+fragments, but the page fragments are cached at the ring or descriptor
+level.  In order to enable these cases it is necessary to provide a generic
+way of tearing down a page cache.  For this reason __page_frag_cache_drain
+was implemented.  It allows for freeing multiple references from a single
+page via a single call.  The advantage to doing this is that it allows for
+cleaning up the multiple references that were added to a page in order to
+avoid calling get_page per allocation.
+
+Alexander Duyck, Nov 29, 2016.
diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst
new file mode 100644
index 000000000000..8c5cb8147e55
--- /dev/null
+++ b/Documentation/mm/page_migration.rst
@@ -0,0 +1,288 @@
+.. _page_migration:
+
+==============
+Page migration
+==============
+
+Page migration allows moving the physical location of pages between
+nodes in a NUMA system while the process is running. This means that the
+virtual addresses that the process sees do not change. However, the
+system rearranges the physical location of those pages.
+
+Also see :ref:`Heterogeneous Memory Management (HMM) <hmm>`
+for migrating pages to or from device private memory.
+
+The main intent of page migration is to reduce the latency of memory accesses
+by moving pages near to the processor where the process accessing that memory
+is running.
+
+Page migration allows a process to manually relocate the node on which its
+pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
+a new memory policy via mbind(). The pages of a process can also be relocated
+from another process using the sys_migrate_pages() function call. The
+migrate_pages() function call takes two sets of nodes and moves pages of a
+process that are located on the from nodes to the destination nodes.
+Page migration functions are provided by the numactl package by Andi Kleen
+(a version later than 0.9.3 is required. Get it from
+https://github.com/numactl/numactl.git). numactl provides libnuma
+which provides an interface similar to other NUMA functionality for page
+migration.  cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
+pages of a process are located. See also the numa_maps documentation in the
+proc(5) man page.
+
+Manual migration is useful if for example the scheduler has relocated
+a process to a processor on a distant node. A batch scheduler or an
+administrator may detect the situation and move the pages of the process
+nearer to the new processor. The kernel itself only provides
+manual page migration support. Automatic page migration may be implemented
+through user space processes that move pages. A special function call
+"move_pages" allows the moving of individual pages within a process.
+For example, A NUMA profiler may obtain a log showing frequent off-node
+accesses and may use the result to move pages to more advantageous
+locations.
+
+Larger installations usually partition the system using cpusets into
+sections of nodes. Paul Jackson has equipped cpusets with the ability to
+move pages when a task is moved to another cpuset (See
+:ref:`CPUSETS <cpusets>`).
+Cpusets allow the automation of process locality. If a task is moved to
+a new cpuset then also all its pages are moved with it so that the
+performance of the process does not sink dramatically. Also the pages
+of processes in a cpuset are moved if the allowed memory nodes of a
+cpuset are changed.
+
+Page migration allows the preservation of the relative location of pages
+within a group of nodes for all migration techniques which will preserve a
+particular memory allocation pattern generated even after migrating a
+process. This is necessary in order to preserve the memory latencies.
+Processes will run with similar performance after migration.
+
+Page migration occurs in several steps. First a high level
+description for those trying to use migrate_pages() from the kernel
+(for userspace usage see the Andi Kleen's numactl package mentioned above)
+and then a low level description of how the low level details work.
+
+In kernel use of migrate_pages()
+================================
+
+1. Remove pages from the LRU.
+
+   Lists of pages to be migrated are generated by scanning over
+   pages and moving them into lists. This is done by
+   calling isolate_lru_page().
+   Calling isolate_lru_page() increases the references to the page
+   so that it cannot vanish while the page migration occurs.
+   It also prevents the swapper or other scans from encountering
+   the page.
+
+2. We need to have a function of type new_page_t that can be
+   passed to migrate_pages(). This function should figure out
+   how to allocate the correct new page given the old page.
+
+3. The migrate_pages() function is called which attempts
+   to do the migration. It will call the function to allocate
+   the new page for each page that is considered for
+   moving.
+
+How migrate_pages() works
+=========================
+
+migrate_pages() does several passes over its list of pages. A page is moved
+if all references to a page are removable at the time. The page has
+already been removed from the LRU via isolate_lru_page() and the refcount
+is increased so that the page cannot be freed while page migration occurs.
+
+Steps:
+
+1. Lock the page to be migrated.
+
+2. Ensure that writeback is complete.
+
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet up-to-date) page immediately block while the move is in progress.
+
+4. All the page table references to the page are converted to migration
+   entries. This decreases the mapcount of a page. If the resulting
+   mapcount is not zero then we do not migrate the page. All user space
+   processes that attempt to access the page will now wait on the page lock
+   or wait for the migration page table entry to be removed.
+
+5. The i_pages lock is taken. This will cause all processes trying
+   to access the page via the mapping to block on the spinlock.
+
+6. The refcount of the page is examined and we back out if references remain.
+   Otherwise, we know that we are the only one referencing this page.
+
+7. The radix tree is checked and if it does not contain the pointer to this
+   page then we back out because someone else modified the radix tree.
+
+8. The new page is prepped with some settings from the old page so that
+   accesses to the new page will discover a page with the correct settings.
+
+9. The radix tree is changed to point to the new page.
+
+10. The reference count of the old page is dropped because the address space
+    reference is gone. A reference to the new page is established because
+    the new page is referenced by the address space.
+
+11. The i_pages lock is dropped. With that lookups in the mapping
+    become possible again. Processes will move from spinning on the lock
+    to sleeping on the locked new page.
+
+12. The page contents are copied to the new page.
+
+13. The remaining page flags are copied to the new page.
+
+14. The old page flags are cleared to indicate that the page does
+    not provide any information anymore.
+
+15. Queued up writeback on the new page is triggered.
+
+16. If migration entries were inserted into the page table, then replace them
+    with real ptes. Doing so will enable access for user space processes not
+    already waiting for the page lock.
+
+17. The page locks are dropped from the old and new page.
+    Processes waiting on the page lock will redo their page faults
+    and will reach the new page.
+
+18. The new page is moved to the LRU and can be scanned by the swapper,
+    etc. again.
+
+Non-LRU page migration
+======================
+
+Although migration originally aimed for reducing the latency of memory accesses
+for NUMA, compaction also uses migration to create high-order pages.
+
+Current problem of the implementation is that it is designed to migrate only
+*LRU* pages. However, there are potential non-LRU pages which can be migrated
+in drivers, for example, zsmalloc, virtio-balloon pages.
+
+For virtio-balloon pages, some parts of migration code path have been hooked
+up and added virtio-balloon specific functions to intercept migration logics.
+It's too specific to a driver so other drivers who want to make their pages
+movable would have to add their own specific hooks in the migration path.
+
+To overcome the problem, VM supports non-LRU page migration which provides
+generic functions for non-LRU movable pages without driver specific hooks
+in the migration path.
+
+If a driver wants to make its pages movable, it should define three functions
+which are function pointers of struct address_space_operations.
+
+1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
+
+   What VM expects from isolate_page() function of driver is to return *true*
+   if driver isolates the page successfully. On returning true, VM marks the page
+   as PG_isolated so concurrent isolation in several CPUs skip the page
+   for isolation. If a driver cannot isolate the page, it should return *false*.
+
+   Once page is successfully isolated, VM uses page.lru fields so driver
+   shouldn't expect to preserve values in those fields.
+
+2. ``int (*migratepage) (struct address_space *mapping,``
+|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
+
+   After isolation, VM calls migratepage() of driver with the isolated page.
+   The function of migratepage() is to move the contents of the old page to the
+   new page
+   and set up fields of struct page newpage. Keep in mind that you should
+   indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
+   under page_lock if you migrated the oldpage successfully and returned
+   MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
+   can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
+   because VM interprets -EAGAIN as "temporary migration failure". On returning
+   any error except -EAGAIN, VM will give up the page migration without
+   retrying.
+
+   Driver shouldn't touch the page.lru field while in the migratepage() function.
+
+3. ``void (*putback_page)(struct page *);``
+
+   If migration fails on the isolated page, VM should return the isolated page
+   to the driver so VM calls the driver's putback_page() with the isolated page.
+   In this function, the driver should put the isolated page back into its own data
+   structure.
+
+Non-LRU movable page flags
+
+   There are two page flags for supporting non-LRU movable page.
+
+   * PG_movable
+
+     Driver should use the function below to make page movable under page_lock::
+
+	void __SetPageMovable(struct page *page, struct address_space *mapping)
+
+     It needs argument of address_space for registering migration
+     family functions which will be called by VM. Exactly speaking,
+     PG_movable is not a real flag of struct page. Rather, VM
+     reuses the page->mapping's lower bits to represent it::
+
+	#define PAGE_MAPPING_MOVABLE 0x2
+	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
+
+     so driver shouldn't access page->mapping directly. Instead, driver should
+     use page_mapping() which masks off the low two bits of page->mapping under
+     page lock so it can get the right struct address_space.
+
+     For testing of non-LRU movable pages, VM supports __PageMovable() function.
+     However, it doesn't guarantee to identify non-LRU movable pages because
+     the page->mapping field is unified with other variables in struct page.
+     If the driver releases the page after isolation by VM, page->mapping
+     doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set
+     (look at __ClearPageMovable). But __PageMovable() is cheap to call whether
+     page is LRU or non-LRU movable once the page has been isolated because LRU
+     pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also
+     good for just peeking to test non-LRU movable pages before more expensive
+     checking with lock_page() in pfn scanning to select a victim.
+
+     For guaranteeing non-LRU movable page, VM provides PageMovable() function.
+     Unlike __PageMovable(), PageMovable() validates page->mapping and
+     mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents
+     sudden destroying of page->mapping.
+
+     Drivers using __SetPageMovable() should clear the flag via
+     __ClearMovablePage() under page_lock() before the releasing the page.
+
+   * PG_isolated
+
+     To prevent concurrent isolation among several CPUs, VM marks isolated page
+     as PG_isolated under lock_page(). So if a CPU encounters PG_isolated
+     non-LRU movable page, it can skip it. Driver doesn't need to manipulate the
+     flag because VM will set/clear it automatically. Keep in mind that if the
+     driver sees a PG_isolated page, it means the page has been isolated by the
+     VM so it shouldn't touch the page.lru field.
+     The PG_isolated flag is aliased with the PG_reclaim flag so drivers
+     shouldn't use PG_isolated for its own purposes.
+
+Monitoring Migration
+=====================
+
+The following events (counters) can be used to monitor page migration.
+
+1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
+   page was migrated. If the page was a non-THP and non-hugetlb page, then
+   this counter is increased by one. If the page was a THP or hugetlb, then
+   this counter is increased by the number of THP or hugetlb subpages.
+   For example, migration of a single 2MB THP that has 4KB-size base pages
+   (subpages) will cause this counter to increase by 512.
+
+2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
+   PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
+   if it was a THP or hugetlb.
+
+3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
+
+4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split.
+
+5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had
+   to be split. After splitting, a migration retry was used for it's sub-pages.
+
+THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or
+PGMIGRATE_FAIL events. For example, a THP migration failure will cause both
+THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
+
+Christoph Lameter, May 8, 2006.
+Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
new file mode 100644
index 000000000000..f5c954afe97c
--- /dev/null
+++ b/Documentation/mm/page_owner.rst
@@ -0,0 +1,196 @@
+.. _page_owner:
+
+==================================================
+page owner: Tracking about who allocated each page
+==================================================
+
+Introduction
+============
+
+page owner is for the tracking about who allocated each page.
+It can be used to debug memory leak or to find a memory hogger.
+When allocation happens, information about allocation such as call stack
+and order of pages is stored into certain storage for each page.
+When we need to know about status of all pages, we can get and analyze
+this information.
+
+Although we already have tracepoint for tracing page allocation/free,
+using it for analyzing who allocate each page is rather complex. We need
+to enlarge the trace buffer for preventing overlapping until userspace
+program launched. And, launched program continually dump out the trace
+buffer for later analysis and it would change system behaviour with more
+possibility rather than just keeping it in memory, so bad for debugging.
+
+page owner can also be used for various purposes. For example, accurate
+fragmentation statistics can be obtained through gfp flag information of
+each page. It is already implemented and activated if page owner is
+enabled. Other usages are more than welcome.
+
+page owner is disabled by default. So, if you'd like to use it, you need
+to add "page_owner=on" to your boot cmdline. If the kernel is built
+with page owner and page owner is disabled in runtime due to not enabling
+boot option, runtime overhead is marginal. If disabled in runtime, it
+doesn't require memory to store owner information, so there is no runtime
+memory overhead. And, page owner inserts just two unlikely branches into
+the page allocator hotpath and if not enabled, then allocation is done
+like as the kernel without page owner. These two unlikely branches should
+not affect to allocation performance, especially if the static keys jump
+label patching functionality is available. Following is the kernel's code
+size change due to this facility.
+
+- Without page owner::
+
+   text    data     bss     dec     hex filename
+   48392   2333     644   51369    c8a9 mm/page_alloc.o
+
+- With page owner::
+
+   text    data     bss     dec     hex filename
+   48800   2445     644   51889    cab1 mm/page_alloc.o
+   6662     108      29    6799    1a8f mm/page_owner.o
+   1025       8       8    1041     411 mm/page_ext.o
+
+Although, roughly, 8 KB code is added in total, page_alloc.o increase by
+520 bytes and less than half of it is in hotpath. Building the kernel with
+page owner and turning it on if needed would be great option to debug
+kernel memory problem.
+
+There is one notice that is caused by implementation detail. page owner
+stores information into the memory from struct page extension. This memory
+is initialized some time later than that page allocator starts in sparse
+memory system, so, until initialization, many pages can be allocated and
+they would have no owner information. To fix it up, these early allocated
+pages are investigated and marked as allocated in initialization phase.
+Although it doesn't mean that they have the right owner information,
+at least, we can tell whether the page is allocated or not,
+more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
+are catched and marked, although they are mostly allocated from struct
+page extension feature. Anyway, after that, no page is left in
+un-tracking state.
+
+Usage
+=====
+
+1) Build user-space helper::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) Enable page owner: add "page_owner=on" to boot cmdline.
+
+3) Do the job that you want to debug.
+
+4) Analyze information from page owner::
+
+	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+   The general output of ``page_owner_full.txt`` is as follows::
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
+   in buf, uses regexp to extract the page order value, counts the times
+   and pages of buf, and finally sorts them according to the parameter(s).
+
+   See the result about who allocated each page
+   in the ``sorted_page_owner.txt``. General output::
+
+	XXX times, XXX pages:
+	Page allocated via order XXX, ...
+	// Detailed stack
+
+   By default, ``page_owner_sort`` is sorted according to the times of buf.
+   If you want to sort by the page nums of buf, use the ``-m`` parameter.
+   The detailed parameters are:
+
+   fundamental function::
+
+	Sort:
+		-a		Sort by memory allocation time.
+		-m		Sort by total memory.
+		-p		Sort by pid.
+		-P		Sort by tgid.
+		-n		Sort by task command name.
+		-r		Sort by memory release time.
+		-s		Sort by stack trace.
+		-t		Sort by times (default).
+		--sort <order>	Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].
+				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
+				optional since default direction is increasing numerical or lexicographic
+				order. Mixed use of abbreviated and complete-form of keys is allowed.
+
+		Examples:
+				./page_owner_sort <input> <output> --sort=n,+pid,-tgid
+				./page_owner_sort <input> <output> --sort=at
+
+   additional function::
+
+	Cull:
+		--cull <rules>
+				Specify culling rules.Culling syntax is key[,key[,...]].Choose a
+				multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
+
+		<rules> is a single argument in the form of a comma-separated list,
+		which offers a way to specify individual culling rules.  The recognized
+		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
+		<rules> can be specified by the sequence of keys k1,k2, ..., as described in
+		the STANDARD SORT KEYS section below. Mixed use of abbreviated and
+		complete-form of keys is allowed.
+
+		Examples:
+				./page_owner_sort <input> <output> --cull=stacktrace
+				./page_owner_sort <input> <output> --cull=st,pid,name
+				./page_owner_sort <input> <output> --cull=n,f
+
+	Filter:
+		-f		Filter out the information of blocks whose memory has been released.
+
+	Select:
+		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
+					numbers appear in <pidlist>.
+		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
+					group ID numbers appear in <tgidlist>.
+		--name <cmdlist>	Select by task command name. This selects the blocks whose
+					task command name appear in <cmdlist>.
+
+		<pidlist>, <tgidlist>, <cmdlist> are single arguments in the form of a comma-separated list,
+		which offers a way to specify individual selecting rules.
+
+
+		Examples:
+				./page_owner_sort <input> <output> --pid=1
+				./page_owner_sort <input> <output> --tgid=1,2,3
+				./page_owner_sort <input> <output> --name name1,name2
+
+STANDARD FORMAT SPECIFIERS
+==========================
+::
+
+  For --sort option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	st		stacktrace	stack trace of the page allocation
+	T		txt		full text of block
+	ft		free_ts		timestamp of the page when it was released
+	at		alloc_ts	timestamp of the page when it was allocated
+	ator		allocator	memory allocator for pages
+
+  For --curl option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	f		free		whether the page has been released or not
+	st		stacktrace	stack trace of the page allocation
+	ator		allocator	memory allocator for pages
diff --git a/Documentation/mm/page_reclaim.rst b/Documentation/mm/page_reclaim.rst
new file mode 100644
index 000000000000..50a30b7f8ac3
--- /dev/null
+++ b/Documentation/mm/page_reclaim.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Page Reclaim
+============
diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst
new file mode 100644
index 000000000000..1a09472f10a3
--- /dev/null
+++ b/Documentation/mm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _page_table_check:
+
+================
+Page Table Check
+================
+
+Introduction
+============
+
+Page table check allows to harden the kernel by ensuring that some types of
+the memory corruptions are prevented.
+
+Page table check performs extra verifications at the time when new pages become
+accessible from the userspace by getting their page table entries (PTEs PMDs
+etc.) added into the table.
+
+In case of detected corruption, the kernel is crashed. There is a small
+performance and memory overhead associated with the page table check. Therefore,
+it is disabled by default, but can be optionally enabled on systems where the
+extra hardening outweighs the performance costs. Also, because page table check
+is synchronous, it can help with debugging double map memory corruption issues,
+by crashing kernel at the time wrong mapping occurs instead of later which is
+often the case with memory corruptions bugs.
+
+Double mapping detection logic
+==============================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping   | New mapping       | Permissions       | Rule             |
++===================+===================+===================+==================+
+| Anonymous         | Anonymous         | Read              | Allow            |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Named             | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Anonymous         | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Named             | Any               | Allow            |
++-------------------+-------------------+-------------------+------------------+
+
+Enabling Page Table Check
+=========================
+
+Build kernel with:
+
+- PAGE_TABLE_CHECK=y
+  Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
+  is available.
+
+- Boot with 'page_table_check=on' kernel parameter.
+
+Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
+table support without extra kernel parameter.
diff --git a/Documentation/mm/page_tables.rst b/Documentation/mm/page_tables.rst
new file mode 100644
index 000000000000..96939571d7bc
--- /dev/null
+++ b/Documentation/mm/page_tables.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Page Tables
+===========
diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst
new file mode 100644
index 000000000000..2ab7b8c1c863
--- /dev/null
+++ b/Documentation/mm/physical_memory.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Physical Memory
+===============
diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst
new file mode 100644
index 000000000000..e8618fbc62c9
--- /dev/null
+++ b/Documentation/mm/process_addrs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+Process Addresses
+=================
diff --git a/Documentation/mm/remap_file_pages.rst b/Documentation/mm/remap_file_pages.rst
new file mode 100644
index 000000000000..7bef6718e3a9
--- /dev/null
+++ b/Documentation/mm/remap_file_pages.rst
@@ -0,0 +1,33 @@
+.. _remap_file_pages:
+
+==============================
+remap_file_pages() system call
+==============================
+
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
+
+Supporting of nonlinear mapping requires significant amount of non-trivial
+code in kernel virtual memory subsystem including hot paths. Also to get
+nonlinear mapping work kernel need a way to distinguish normal page table
+entries from entries with file offset (pte_file). Kernel reserves flag in
+PTE for this purpose. PTE flags are scarce resource especially on some CPU
+architectures. It would be nice to free up the flag for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It's only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is deprecated and replaced it with an emulation now. The
+emulation creates new VMAs instead of nonlinear mappings. It's going to
+work slower for rare users of remap_file_pages() but ABI is preserved.
+
+One side effect of emulation (apart from performance) is that user can hit
+vm.max_map_count limit more easily due to additional VMAs. See comment for
+DEFAULT_MAX_MAP_COUNT for more details on the limit.
diff --git a/Documentation/mm/shmfs.rst b/Documentation/mm/shmfs.rst
new file mode 100644
index 000000000000..8b01ebb4c30e
--- /dev/null
+++ b/Documentation/mm/shmfs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Shared Memory Filesystem
+========================
diff --git a/Documentation/mm/slab.rst b/Documentation/mm/slab.rst
new file mode 100644
index 000000000000..87d5a5bb172f
--- /dev/null
+++ b/Documentation/mm/slab.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Slab Allocation
+===============
diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst
new file mode 100644
index 000000000000..43063ade737a
--- /dev/null
+++ b/Documentation/mm/slub.rst
@@ -0,0 +1,452 @@
+.. _slub:
+
+==========================
+Short users guide for SLUB
+==========================
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+slab caches. SLUB always includes full debugging but it is off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add an option ``slub_debug``
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the ``slabinfo`` command to get statistical
+data and perform operation on the slabs. By default ``slabinfo`` only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. ``slabinfo`` can be compiled with
+::
+
+	gcc -o slabinfo tools/vm/slabinfo.c
+
+Some of the modes of operation of ``slabinfo`` require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to ``slub_debug``. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options>
+	Enable options for all slabs
+
+slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
+	Enable options only for select slabs (no spaces
+	after a comma)
+
+Multiple blocks of options for all slabs or selected slabs can be given, with
+blocks of options delimited by ';'. The last of "all slabs" blocks is applied
+to all slabs except those that match one of the "select slabs" block. Options
+of the first "select slabs" blocks that matches the slab's name are applied.
+
+Possible debug options are::
+
+	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
+			Sorry SLAB legacy issues)
+	Z		Red zoning
+	P		Poisoning (object and padding)
+	U		User tracking (free and alloc)
+	T		Trace (please only use on single slabs)
+	A		Enable failslab filter mark for the cache
+	O		Switch debugging off for caches that would have
+			caused higher minimum slab orders
+	-		Switch all debugging off (useful if the kernel is
+			configured with CONFIG_SLUB_DEBUG_ON)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify::
+
+	slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try::
+
+	slub_debug=,dentry
+
+to only enable debugging on the dentry cache.  You may use an asterisk at the
+end of the slab name, in order to cover all slabs with the same prefix.  For
+example, here's how you can poison the dentry cache as well as all kmalloc
+slabs::
+
+	slub_debug=P,kmalloc-*,dentry
+
+Red zoning and tracking may realign the slab.  We can just apply sanity checks
+to the dentry cache with::
+
+	slub_debug=F,dentry
+
+Debugging options may require the minimum possible slab order to increase as
+a result of storing the metadata (for example, caches with PAGE_SIZE object
+sizes).  This has a higher liklihood of resulting in slab allocation errors
+in low memory situations or if there's high fragmentation of memory.  To
+switch off debugging for such caches by default, use::
+
+	slub_debug=O
+
+You can apply different options to different list of slab names, using blocks
+of options. This will enable red zoning for dentry and user tracking for
+kmalloc. All other slabs will not get any debugging enabled::
+
+	slub_debug=Z,dentry;U,kmalloc-*
+
+You can also enable options (e.g. sanity checks and poisoning) for all caches
+except some that are deemed too performance critical and don't need to be
+debugged by specifying global debug options followed by a list of slab names
+with "-" as options::
+
+	slub_debug=FZ;-,zs_handle,zspage
+
+The state of each debug option for a slab can be found in the respective files
+under::
+
+	/sys/kernel/slab/<slab name>/
+
+If the file contains 1, the option is enabled, 0 means disabled. The debug
+options from the ``slub_debug`` parameter translate to the following files::
+
+	F	sanity_checks
+	Z	red_zone
+	P	poison
+	U	store_user
+	T	trace
+	A	failslab
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
+
+Slab merging
+============
+
+If no debug options are specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+``slabinfo -a`` displays which slabs were merged together.
+
+Slab validation
+===============
+
+SLUB can validate all object if the kernel was booted with slub_debug. In
+order to do so you must have the ``slabinfo`` tool. Then you can do
+::
+
+	slabinfo -v
+
+which will test all objects. Output will be generated to the syslog.
+
+This also works in a more limited way if boot was without slab debug.
+In that case ``slabinfo -v`` simply tests all reachable objects. Usually
+these are in the cpu slabs and the partial slabs. Full slabs are not
+tracked by SLUB in a non debug situation.
+
+Getting more performance
+========================
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+.. slub_min_objects=x		(default 4)
+.. slub_min_order=x		(default 0)
+.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
+
+``slub_min_objects``
+	allows to specify how many objects must at least fit into one
+	slab in order for the allocation order to be acceptable.  In
+	general slub will be able to perform this number of
+	allocations on a slab without consulting centralized resources
+	(list_lock) where contention may occur.
+
+``slub_min_order``
+	specifies a minimum order of slabs. A similar effect like
+	``slub_min_objects``.
+
+``slub_max_order``
+	specified the order at which ``slub_min_objects`` should no
+	longer be checked. This is useful to avoid SLUB trying to
+	generate super large order pages to fit ``slub_min_objects``
+	of a slab cache with large object sizes into one high order
+	page. Setting command line parameter
+	``debug_guardpage_minorder=N`` (N > 0), forces setting
+	``slub_max_order`` to 0, what cause minimum possible order of
+	slabs allocation.
+
+SLUB Debug output
+=================
+
+Here is a sample of slub debug output::
+
+ ====================================================================
+ BUG kmalloc-8: Right Redzone overwritten
+ --------------------------------------------------------------------
+
+ INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
+ INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
+ INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
+ INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
+
+ Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
+ Object   (0xc90f6d20): 31 30 31 39 2e 30 30 35                         1019.005
+ Redzone  (0xc90f6d28): 00 cc cc cc                                     .
+ Padding  (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
+
+   [<c010523d>] dump_trace+0x63/0x1eb
+   [<c01053df>] show_trace_log_lvl+0x1a/0x2f
+   [<c010601d>] show_trace+0x12/0x14
+   [<c0106035>] dump_stack+0x16/0x18
+   [<c017e0fa>] object_err+0x143/0x14b
+   [<c017e2cc>] check_object+0x66/0x234
+   [<c017eb43>] __slab_free+0x239/0x384
+   [<c017f446>] kfree+0xa6/0xc6
+   [<c02e2335>] get_modalias+0xb9/0xf5
+   [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
+   [<c027866a>] dev_uevent+0x1ad/0x1da
+   [<c0205024>] kobject_uevent_env+0x20a/0x45b
+   [<c020527f>] kobject_uevent+0xa/0xf
+   [<c02779f1>] store_uevent+0x4f/0x58
+   [<c027758e>] dev_attr_store+0x29/0x2f
+   [<c01bec4f>] sysfs_write_file+0x16e/0x19c
+   [<c0183ba7>] vfs_write+0xd1/0x15a
+   [<c01841d7>] sys_write+0x3d/0x72
+   [<c0104112>] sysenter_past_esp+0x5f/0x99
+   [<b7f7b410>] 0xb7f7b410
+   =======================
+
+ FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
+
+If SLUB encounters a corrupted object (full detection requires the kernel
+to be booted with slub_debug) then the following output will be dumped
+into the syslog:
+
+1. Description of the problem encountered
+
+   This will be a message in the system log starting with::
+
+     ===============================================
+     BUG <slab cache affected>: <What went wrong>
+     -----------------------------------------------
+
+     INFO: <corruption start>-<corruption_end> <more info>
+     INFO: Slab <address> <slab information>
+     INFO: Object <address> <object information>
+     INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
+	cpu> pid=<pid of the process>
+     INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
+	pid=<pid of the process>
+
+   (Object allocation / free information is only available if SLAB_STORE_USER is
+   set for the slab. slub_debug sets that option)
+
+2. The object contents if an object was involved.
+
+   Various types of lines can follow the BUG SLUB line:
+
+   Bytes b4 <address> : <bytes>
+	Shows a few bytes before the object where the problem was detected.
+	Can be useful if the corruption does not stop with the start of the
+	object.
+
+   Object <address> : <bytes>
+	The bytes of the object. If the object is inactive then the bytes
+	typically contain poison values. Any non-poison value shows a
+	corruption by a write after free.
+
+   Redzone <address> : <bytes>
+	The Redzone following the object. The Redzone is used to detect
+	writes after the object. All bytes should always have the same
+	value. If there is any deviation then it is due to a write after
+	the object boundary.
+
+	(Redzone information is only available if SLAB_RED_ZONE is set.
+	slub_debug sets that option)
+
+   Padding <address> : <bytes>
+	Unused data to fill up the space in order to get the next object
+	properly aligned. In the debug case we make sure that there are
+	at least 4 bytes of padding. This allows the detection of writes
+	before the object.
+
+3. A stackdump
+
+   The stackdump describes the location where the error was detected. The cause
+   of the corruption is may be more likely found by looking at the function that
+   allocated or freed the object.
+
+4. Report on how the problem was dealt with in order to ensure the continued
+   operation of the system.
+
+   These are messages in the system log beginning with::
+
+	FIX <slab cache affected>: <corrective action taken>
+
+   In the above sample SLUB found that the Redzone of an active object has
+   been overwritten. Here a string of 8 characters was written into a slab that
+   has the length of 8 characters. However, a 8 character string needs a
+   terminating 0. That zero has overwritten the first byte of the Redzone field.
+   After reporting the details of the issue encountered the FIX SLUB message
+   tells us that SLUB has restored the Redzone to its proper value and then
+   system operations continue.
+
+Emergency operations
+====================
+
+Minimal debugging (sanity checks alone) can be enabled by booting with::
+
+	slub_debug=F
+
+This will be generally be enough to enable the resiliency features of slub
+which will keep the system running even if a bad kernel component will
+keep corrupting objects. This may be important for production systems.
+Performance will be impacted by the sanity checks and there will be a
+continual stream of error messages to the syslog but no additional memory
+will be used (unlike full debugging).
+
+No guarantees. The kernel component still needs to be fixed. Performance
+may be optimized further by locating the slab that experiences corruption
+and enabling debugging only for that cache
+
+I.e.::
+
+	slub_debug=F,dentry
+
+If the corruption occurs by writing after the end of the object then it
+may be advisable to enable a Redzone to avoid corrupting the beginning
+of other objects::
+
+	slub_debug=FZ,dentry
+
+Extended slabinfo mode and plotting
+===================================
+
+The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
+ - Slabcache Totals
+ - Slabs sorted by size (up to -N <num> slabs, default 1)
+ - Slabs sorted by loss (up to -N <num> slabs, default 1)
+
+Additionally, in this mode ``slabinfo`` does not dynamically scale
+sizes (G/M/K) and reports everything in bytes (this functionality is
+also available to other slabinfo modes via '-B' option) which makes
+reporting more precise and accurate. Moreover, in some sense the `-X'
+mode also simplifies the analysis of slabs' behaviour, because its
+output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
+pushes the analysis from looking through the numbers (tons of numbers)
+to something easier -- visual analysis.
+
+To generate plots:
+
+a) collect slabinfo extended records, for example::
+
+	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
+
+b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
+
+	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
+
+   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
+   and generates 3 png files (and 3 pre-processing cache files) per STATS
+   file:
+   - Slabcache Totals: FOO_STATS-totals.png
+   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
+   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
+
+Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
+need to compare slabs' behaviour "prior to" and "after" some code
+modification.  To help you out there, ``slabinfo-gnuplot.sh`` script
+can 'merge' the `Slabcache Totals` sections from different
+measurements. To visually compare N plots:
+
+a) Collect as many STATS1, STATS2, .. STATSN files as you need::
+
+	while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
+
+b) Pre-process those STATS files::
+
+	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
+
+c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
+   generated pre-processed \*-totals::
+
+	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
+
+   This will produce a single plot (png file).
+
+   Plots, expectedly, can be large so some fluctuations or small spikes
+   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
+   options to 'zoom-in'/'zoom-out':
+
+   a) ``-s %d,%d`` -- overwrites the default image width and height
+   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
+      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
+      40,60`` range will plot only samples collected between 40th and
+      60th seconds).
+
+
+DebugFS files for SLUB
+======================
+
+For more information about current state of SLUB caches with the user tracking
+debug option enabled, debugfs files are available, typically under
+/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user
+tracking). There are 2 types of these files with the following debug
+information:
+
+1. alloc_traces::
+
+    Prints information about unique allocation traces of the currently
+    allocated objects. The output is sorted by frequency of each trace.
+
+    Information in the output:
+    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
+    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
+
+    Example:::
+
+    1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
+	__slab_alloc+0x6d/0x90
+	kmem_cache_alloc_trace+0x2eb/0x300
+	populate_error_injection_list+0x97/0x110
+	init_error_injection+0x1b/0x71
+	do_one_initcall+0x5f/0x2d0
+	kernel_init_freeable+0x26f/0x2d7
+	kernel_init+0xe/0x118
+	ret_from_fork+0x22/0x30
+
+
+2. free_traces::
+
+    Prints information about unique freeing traces of the currently allocated
+    objects. The freeing traces thus come from the previous life-cycle of the
+    objects and are reported as not available for objects allocated for the first
+    time. The output is sorted by frequency of each trace.
+
+    Information in the output:
+    Number of objects, freeing function, minimal/average/maximal jiffies since free,
+    pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
+
+    Example:::
+
+    1980 <not-available> age=4294912290 pid=0 cpus=0
+    51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
+	kfree+0x2db/0x420
+	acpi_ut_update_ref_count+0x6a6/0x782
+	acpi_ut_update_object_reference+0x1ad/0x234
+	acpi_ut_remove_reference+0x7d/0x84
+	acpi_rs_get_prt_method_data+0x97/0xd6
+	acpi_get_irq_routing_table+0x82/0xc4
+	acpi_pci_irq_find_prt_entry+0x8e/0x2e0
+	acpi_pci_irq_lookup+0x3a/0x1e0
+	acpi_pci_irq_enable+0x77/0x240
+	pcibios_enable_device+0x39/0x40
+	do_pci_enable_device.part.0+0x5d/0xe0
+	pci_enable_device_flags+0xfc/0x120
+	pci_enable_device+0x13/0x20
+	virtio_pci_probe+0x9e/0x170
+	local_pci_probe+0x48/0x80
+	pci_device_probe+0x105/0x1c0
+
+Christoph Lameter, May 30, 2007
+Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst
new file mode 100644
index 000000000000..c08919662704
--- /dev/null
+++ b/Documentation/mm/split_page_table_lock.rst
@@ -0,0 +1,100 @@
+.. _split_page_table_lock:
+
+=====================
+Split page table lock
+=====================
+
+Originally, mm->page_table_lock spinlock protected all page tables of the
+mm_struct. But this approach leads to poor page fault scalability of
+multi-threaded applications due high contention on the lock. To improve
+scalability, split page table lock was introduced.
+
+With split page table lock we have separate per-table lock to serialize
+access to the table. At the moment we use split lock for PTE and PMD
+tables. Access to higher level tables protected by mm->page_table_lock.
+
+There are helpers to lock/unlock a table and other accessor functions:
+
+ - pte_offset_map_lock()
+	maps pte and takes PTE table lock, returns pointer to the taken
+	lock;
+ - pte_unmap_unlock()
+	unlocks and unmaps PTE table;
+ - pte_alloc_map_lock()
+	allocates PTE table if needed and take the lock, returns pointer
+	to taken lock or NULL if allocation failed;
+ - pte_lockptr()
+	returns pointer to PTE table lock;
+ - pmd_lock()
+	takes PMD table lock, returns pointer to taken lock;
+ - pmd_lockptr()
+	returns pointer to PMD table lock;
+
+Split page table lock for PTE tables is enabled compile-time if
+CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS.
+If split lock is disabled, all tables are guarded by mm->page_table_lock.
+
+Split page table lock for PMD tables is enabled, if it's enabled for PTE
+tables and the architecture supports it (see below).
+
+Hugetlb and split page table lock
+=================================
+
+Hugetlb can support several page sizes. We use split lock only for PMD
+level, but not for PUD.
+
+Hugetlb-specific helpers:
+
+ - huge_pte_lock()
+	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
+	otherwise;
+ - huge_pte_lockptr()
+	returns pointer to table lock;
+
+Support of split page table lock by an architecture
+===================================================
+
+There's no need in special enabling of PTE split page table lock: everything
+required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
+must be called on PTE table allocation / freeing.
+
+Make sure the architecture doesn't use slab allocator for page table
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
+
+PMD split lock only makes sense if you have more than two page table
+levels.
+
+PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
+allocation and pgtable_pmd_page_dtor() on freeing.
+
+Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
+pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
+paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
+
+With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
+
+NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
+be handled properly.
+
+page->ptl
+=========
+
+page->ptl is used to access split page table lock, where 'page' is struct
+page of page containing the table. It shares storage with page->private
+(and few other fields in union).
+
+To avoid increasing size of struct page and have best performance, we use a
+trick:
+
+ - if spinlock_t fits into long, we use page->ptr as spinlock, so we
+   can avoid indirect access and save a cache line.
+ - if size of spinlock_t is bigger then size of long, we use page->ptl as
+   pointer to spinlock_t and allocate it dynamically. This allows to use
+   split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
+   one more cache line for indirect access;
+
+The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
+pgtable_pmd_page_ctor() for PMD table.
+
+Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/mm/swap.rst b/Documentation/mm/swap.rst
new file mode 100644
index 000000000000..78819bd4d745
--- /dev/null
+++ b/Documentation/mm/swap.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+Swap
+====
diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst
new file mode 100644
index 000000000000..216db1d67d04
--- /dev/null
+++ b/Documentation/mm/transhuge.rst
@@ -0,0 +1,187 @@
+.. _transhuge:
+
+============================
+Transparent Hugepage Support
+============================
+
+This document describes design principles for Transparent Hugepage (THP)
+support and its interaction with other parts of the memory management
+system.
+
+Design principles
+=================
+
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking huge pmd mapping into table of ptes and,
+  if necessary, split a transparent hugepage. Therefore these components
+  can continue working on the regular pages or regular pte mappings.
+
+- if a hugepage allocation fails because of memory fragmentation,
+  regular pages should be gracefully allocated instead and mixed in
+  the same vma without any failure or significant delay and without
+  userland noticing
+
+- if some task quits and more hugepages become available (either
+  immediately in the buddy or through the VM), guest physical memory
+  backed by regular pages should be relocated on hugepages
+  automatically (with khugepaged)
+
+- it doesn't require memory reservation and in turn it uses hugepages
+  whenever possible (the only possible reservation here is kernelcore=
+  to avoid unmovable pages to fragment all the memory but such a tweak
+  is not specific to transparent hugepage support and it's a generic
+  feature that applies to all dynamic high order allocations in the
+  kernel)
+
+get_user_pages and follow_page
+==============================
+
+get_user_pages and follow_page if run on a hugepage, will return the
+head or tail pages as usual (exactly as they would do on
+hugetlbfs). Most GUP users will only care about the actual physical
+address of the page and its temporary pinning to release after the I/O
+is complete, so they won't ever notice the fact the page is huge. But
+if any driver is going to mangle over the page structure of the tail
+page (like for checking page->mapping or other bits that are relevant
+for the head page and not the tail page), it should be updated to jump
+to check head page instead. Taking a reference on any head/tail page would
+prevent the page from being split by anyone.
+
+.. note::
+   these aren't new constraints to the GUP API, and they match the
+   same constraints that apply to hugetlbfs too, so any driver capable
+   of handling GUP on hugetlbfs will also work fine on transparent
+   hugepage backed mappings.
+
+Graceful fallback
+=================
+
+Code walking pagetables but unaware about huge pmds can simply call
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
+pmd_offset. It's trivial to make the code transparent hugepage aware
+by just grepping for "pmd_offset" and adding split_huge_pmd where
+missing after pmd_offset returns the pmd. Thanks to the graceful
+fallback design, with a one liner change, you can avoid to write
+hundreds if not thousands of lines of complex code to make your code
+hugepage aware.
+
+If you're not walking pagetables but you run into a physical hugepage
+that you can't handle natively in your code, you can split it by
+calling split_huge_page(page). This is what the Linux VM does before
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
+
+Example to make mremap.c transparent hugepage aware with a one liner
+change::
+
+	diff --git a/mm/mremap.c b/mm/mremap.c
+	--- a/mm/mremap.c
+	+++ b/mm/mremap.c
+	@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
+			return NULL;
+
+		pmd = pmd_offset(pud, addr);
+	+	split_huge_pmd(vma, pmd, addr);
+		if (pmd_none_or_clear_bad(pmd))
+			return NULL;
+
+Locking in hugepage aware code
+==============================
+
+We want as much code as possible hugepage aware, as calling
+split_huge_page() or split_huge_pmd() has a cost.
+
+To make pagetable walks huge pmd aware, all you need to do is to call
+pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
+mmap_lock in read (or write) mode to be sure a huge pmd cannot be
+created from under you by khugepaged (khugepaged collapse_huge_page
+takes the mmap_lock in write mode in addition to the anon_vma lock). If
+pmd_trans_huge returns false, you just fallback in the old code
+paths. If instead pmd_trans_huge returns true, you have to take the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd being converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
+pagetable walk). If the second pmd_trans_huge returns false, you
+should just drop the page table lock and fallback to the old code as
+before. Otherwise, you can proceed to process the huge pmd and the
+hugepage natively. Once finished, you can drop the page table lock.
+
+Refcounts and transparent huge pages
+====================================
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on head page's ->_refcount.
+
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
+    on relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted for in compound_mapcount
+    (stored in first tail page). For file huge pages, we also increment
+    ->_mapcount of all sub-pages in order to have race-free detection of
+    last unmap of subpages.
+
+PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
+
+For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all
+subpages is offset up by one. This additional reference is required to
+get race-free detection of unmap of subpages when we have them mapped with
+both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+For anonymous pages, we set PG_double_map when a PMD of the page is split
+for the first time, but still have a PMD mapping. The additional references
+go away with the last compound_mapcount.
+
+File pages get PG_double_map set on the first map of the page with PTE and
+goes away when the page gets evicted from the page cache.
+
+split_huge_page internally has to distribute the refcounts in the head
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries, but we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+requests to split pinned huge pages: it expects page count to be equal to
+the sum of mapcount of all sub-pages plus one (split_huge_page caller must
+have a reference to the head page).
+
+split_huge_page uses migration entries to stabilize page->_refcount and
+page->_mapcount of anonymous pages. File pages just get unmapped.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_refcount until atomic_add(). This prevents the
+scanner from getting a reference to the tail page up to that point. After the
+atomic_add() we don't care about the ->_refcount value. We already know how
+many references should be uncharged from the head page.
+
+For head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where references should go after split: it will stay on the head page.
+
+Note that split_huge_pmd() doesn't have any limitations on refcounting:
+pmd can be split at any point and never fails.
+
+Partial unmap and deferred_split_huge_page()
+============================================
+
+Unmapping part of THP (with munmap() or other way) is not going to free
+memory immediately. Instead, we detect that a subpage of THP is not in use
+in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up unused subpages.
+
+Splitting the page right away is not an option due to locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive since in many cases partial unmap happens during exit(2) if
+a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst
new file mode 100644
index 000000000000..b280367d6a44
--- /dev/null
+++ b/Documentation/mm/unevictable-lru.rst
@@ -0,0 +1,554 @@
+.. _unevictable_lru:
+
+==============================
+Unevictable LRU Infrastructure
+==============================
+
+.. contents:: :local:
+
+
+Introduction
+============
+
+This document describes the Linux memory manager's "Unevictable LRU"
+infrastructure and the use of this to manage several types of "unevictable"
+pages.
+
+The document attempts to provide the overall rationale behind this mechanism
+and the rationale for some of the design decisions that drove the
+implementation.  The latter design rationale is discussed in the context of an
+implementation description.  Admittedly, one can obtain the implementation
+details - the "what does it do?" - by reading the code.  One hopes that the
+descriptions below add value by provide the answer to "why does it do that?".
+
+
+
+The Unevictable LRU
+===================
+
+The Unevictable LRU facility adds an additional LRU list to track unevictable
+pages and to hide these pages from vmscan.  This mechanism is based on a patch
+by Larry Woodman of Red Hat to address several scalability problems with page
+reclaim in Linux.  The problems have been observed at customer sites on large
+memory x86_64 systems.
+
+To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
+main memory will have over 32 million 4k pages in a single node.  When a large
+fraction of these pages are not evictable for any reason [see below], vmscan
+will spend a lot of time scanning the LRU lists looking for the small fraction
+of pages that are evictable.  This can result in a situation where all CPUs are
+spending 100% of their time in vmscan for hours or days on end, with the system
+completely unresponsive.
+
+The unevictable list addresses the following classes of unevictable pages:
+
+ * Those owned by ramfs.
+
+ * Those mapped into SHM_LOCK'd shared memory regions.
+
+ * Those mapped into VM_LOCKED [mlock()ed] VMAs.
+
+The infrastructure may also be able to handle other conditions that make pages
+unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable LRU Page List
+-----------------------------
+
+The Unevictable LRU page list is a lie.  It was never an LRU-ordered list, but a
+companion to the LRU-ordered anonymous and file, active and inactive page lists;
+and now it is not even a page list.  But following familiar convention, here in
+this document and in the source, we often imagine it as a fifth LRU page list.
+
+The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
+called the "unevictable" list and an associated page flag, PG_unevictable, to
+indicate that the page is being managed on the unevictable list.
+
+The PG_unevictable flag is analogous to, and mutually exclusive with, the
+PG_active flag in that it indicates on which LRU list a page resides when
+PG_lru is set.
+
+The Unevictable LRU infrastructure maintains unevictable pages as if they were
+on an additional LRU list for a few reasons:
+
+ (1) We get to "treat unevictable pages just like we treat other pages in the
+     system - which means we get to use the same code to manipulate them, the
+     same code to isolate them (for migrate, etc.), the same code to keep track
+     of the statistics, etc..." [Rik van Riel]
+
+ (2) We want to be able to migrate unevictable pages between nodes for memory
+     defragmentation, workload management and memory hotplug.  The Linux kernel
+     can only migrate pages that it can successfully isolate from the LRU
+     lists (or "Movable" pages: outside of consideration here).  If we were to
+     maintain pages elsewhere than on an LRU-like list, where they can be
+     detected by isolate_lru_page(), we would prevent their migration.
+
+The unevictable list does not differentiate between file-backed and anonymous,
+swap-backed pages.  This differentiation is only important while the pages are,
+in fact, evictable.
+
+The unevictable list benefits from the "arrayification" of the per-node LRU
+lists and statistics originally proposed and posted by Christoph Lameter.
+
+
+Memory Control Group Interaction
+--------------------------------
+
+The unevictable LRU facility interacts with the memory control group [aka
+memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
+extending the lru_list enum.
+
+The memory controller data structure automatically gets a per-node unevictable
+list as a result of the "arrayification" of the per-node LRU lists (one per
+lru_list enum element).  The memory controller tracks the movement of pages to
+and from the unevictable list.
+
+When a memory control group comes under memory pressure, the controller will
+not attempt to reclaim pages on the unevictable list.  This has a couple of
+effects:
+
+ (1) Because the pages are "hidden" from reclaim on the unevictable list, the
+     reclaim process can be more efficient, dealing only with pages that have a
+     chance of being reclaimed.
+
+ (2) On the other hand, if too many of the pages charged to the control group
+     are unevictable, the evictable portion of the working set of the tasks in
+     the control group may not fit into the available memory.  This can cause
+     the control group to thrash or to OOM-kill tasks.
+
+
+.. _mark_addr_space_unevict:
+
+Marking Address Spaces Unevictable
+----------------------------------
+
+For facilities such as ramfs none of the pages attached to the address space
+may be evicted.  To prevent eviction of any such pages, the AS_UNEVICTABLE
+address space flag is provided, and this can be manipulated by a filesystem
+using a number of wrapper functions:
+
+ * ``void mapping_set_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being completely unevictable.
+
+ * ``void mapping_clear_unevictable(struct address_space *mapping);``
+
+	Mark the address space as being evictable.
+
+ * ``int mapping_unevictable(struct address_space *mapping);``
+
+	Query the address space, and return true if it is completely
+	unevictable.
+
+These are currently used in three places in the kernel:
+
+ (1) By ramfs to mark the address spaces of its inodes when they are created,
+     and this mark remains for the life of the inode.
+
+ (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
+     Note that SHM_LOCK is not required to page in the locked pages if they're
+     swapped out; the application must touch the pages manually if it wants to
+     ensure they're in memory.
+
+ (3) By the i915 driver to mark pinned address space until it's unpinned. The
+     amount of unevictable memory marked by i915 driver is roughly the bounded
+     object size in debugfs/dri/0/i915_gem_objects.
+
+
+Detecting Unevictable Pages
+---------------------------
+
+The function page_evictable() in mm/internal.h determines whether a page is
+evictable or not using the query function outlined above [see section
+:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
+to check the AS_UNEVICTABLE flag.
+
+For address spaces that are so marked after being populated (as SHM regions
+might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
+the page tables for the region as does, for example, mlock(), nor need it make
+any special effort to push any pages in the SHM_LOCK'd area to the unevictable
+list.  Instead, vmscan will do this if and when it encounters the pages during
+a reclamation scan.
+
+On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan
+the pages in the region and "rescue" them from the unevictable list if no other
+condition is keeping them unevictable.  If an unevictable region is destroyed,
+the pages are also "rescued" from the unevictable list in the process of
+freeing them.
+
+page_evictable() also checks for mlocked pages by testing an additional page
+flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
+faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
+
+
+Vmscan's Handling of Unevictable Pages
+--------------------------------------
+
+If unevictable pages are culled in the fault path, or moved to the unevictable
+list at mlock() or mmap() time, vmscan will not encounter the pages until they
+have become evictable again (via munlock() for example) and have been "rescued"
+from the unevictable list.  However, there may be situations where we decide,
+for the sake of expediency, to leave an unevictable page on one of the regular
+active/inactive LRU lists for vmscan to deal with.  vmscan checks for such
+pages in all of the shrink_{active|inactive|page}_list() functions and will
+"cull" such pages that it encounters: that is, it diverts those pages to the
+unevictable list for the memory cgroup and node being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED VMA, but the
+page is not marked as PG_mlocked.  Such pages will make it all the way to
+shrink_active_list() or shrink_page_list() where they will be detected when
+vmscan walks the reverse map in page_referenced() or try_to_unmap().  The page
+is culled to the unevictable list when it is released by the shrinker.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
+using putback_lru_page() - the inverse operation to isolate_lru_page() - after
+dropping the page lock.  Because the condition which makes the page unevictable
+may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the
+unevictable state of a page before placing it on the unevictable list.
+
+
+MLOCKED Pages
+=============
+
+The unevictable page list is also useful for mlock(), in addition to ramfs and
+SYSV SHM.  Note that mlock() is only available in CONFIG_MMU=y situations; in
+NOMMU situations, all mappings are effectively mlocked.
+
+
+History
+-------
+
+The "Unevictable mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph Lameter
+to achieve the same objective: hiding mlocked pages from vmscan.
+
+In Nick's patch, he used one of the struct page LRU list link fields as a count
+of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years
+earlier).  But this use of the link field for a count prevented the management
+of the pages on an LRU list, and thus mlocked pages were not migratable as
+isolate_lru_page() could not detect them, and the LRU list link field was not
+available to the migration subsystem.
+
+Nick resolved this by putting mlocked pages back on the LRU list before
+attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs.  When
+Nick's patch was integrated with the Unevictable LRU work, the count was
+replaced by walking the reverse map when munlocking, to determine whether any
+other VM_LOCKED VMAs still mapped the page.
+
+However, walking the reverse map for each page when munlocking was ugly and
+inefficient, and could lead to catastrophic contention on a file's rmap lock,
+when many processes which had it mlocked were trying to exit.  In 5.18, the
+idea of keeping mlock_count in Unevictable LRU list link field was revived and
+put to work, without preventing the migration of mlocked pages.  This is why
+the "Unevictable LRU list" cannot be a linked list of pages now; but there was
+no use for that linked list anyway - though its size is maintained for meminfo.
+
+
+Basic Management
+----------------
+
+mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
+pages.  When such a page has been "noticed" by the memory management subsystem,
+the page is marked with the PG_mlocked flag.  This can be manipulated using the
+PageMlocked() functions.
+
+A PG_mlocked page will be placed on the unevictable list when it is added to
+the LRU.  Such pages can be "noticed" by memory management in several places:
+
+ (1) in the mlock()/mlock2()/mlockall() system call handlers;
+
+ (2) in the mmap() system call handler when mmapping a region with the
+     MAP_LOCKED flag;
+
+ (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
+     flag;
+
+ (4) in the fault path and when a VM_LOCKED stack segment is expanded; or
+
+ (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
+     reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
+
+mlocked pages become unlocked and rescued from the unevictable list when:
+
+ (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
+
+ (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
+     unmapping at task exit;
+
+ (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
+     or
+
+ (4) before a page is COW'd in a VM_LOCKED VMA.
+
+
+mlock()/mlock2()/mlockall() System Call Handling
+------------------------------------------------
+
+mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup()
+for each VMA in the range specified by the call.  In the case of mlockall(),
+this is the entire active address space of the task.  Note that mlock_fixup()
+is used for both mlocking and munlocking a range of memory.  A call to mlock()
+an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is
+treated as a no-op and mlock_fixup() simply returns.
+
+If the VMA passes some filtering as described in "Filtering Special VMAs"
+below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
+off a subset of the VMA if the range does not cover the entire VMA.  Any pages
+already present in the VMA are then marked as mlocked by mlock_page() via
+mlock_pte_range() via walk_page_range() via mlock_vma_pages_range().
+
+Before returning from the system call, do_mlock() or mlockall() will call
+__mm_populate() to fault in the remaining pages via get_user_pages() and to
+mark those pages as mlocked as they are faulted.
+
+Note that the VMA being mlocked might be mapped with PROT_NONE.  In this case,
+get_user_pages() will be unable to fault in the pages.  That's okay.  If pages
+do end up getting faulted into this VM_LOCKED VMA, they will be handled in the
+fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled.
+
+For each PTE (or PMD) being faulted into a VMA, the page add rmap function
+calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED
+(unless it is a PTE mapping of a part of a transparent huge page).  Or when
+it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable()
+calls mlock_new_page() instead: similar to mlock_page(), but can make better
+judgments, since this page is held exclusively and known not to be on LRU yet.
+
+mlock_page() sets PageMlocked immediately, then places the page on the CPU's
+mlock pagevec, to batch up the rest of the work to be done under lru_lock by
+__mlock_page().  __mlock_page() sets PageUnevictable, initializes mlock_count
+and moves the page to unevictable state ("the unevictable LRU", but with
+mlock_count in place of LRU threading).  Or if the page was already PageLRU
+and PageUnevictable and PageMlocked, it simply increments the mlock_count.
+
+But in practice that may not work ideally: the page may not yet be on an LRU, or
+it may have been temporarily isolated from LRU.  In such cases the mlock_count
+field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn()
+returns the page to "LRU".  Races prohibit mlock_count from being set to 1 then:
+rather than risk stranding a page indefinitely as unevictable, always err with
+mlock_count on the low side, so that when munlocked the page will be rescued to
+an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a
+VM_LOCKED VMA.
+
+
+Filtering Special VMAs
+----------------------
+
+mlock_fixup() filters several classes of "special" VMAs:
+
+1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely.  The pages behind
+   these mappings are inherently pinned, so we don't need to mark them as
+   mlocked.  In any case, most of the pages have no struct page in which to so
+   mark the page.  Because of this, get_user_pages() will fail for these VMAs,
+   so there is no sense in attempting to visit them.
+
+2) VMAs mapping hugetlbfs page are already effectively pinned into memory.  We
+   neither need nor want to mlock() these pages.  But __mm_populate() includes
+   hugetlbfs ranges, allocating the huge pages and populating the PTEs.
+
+3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
+   such as the VDSO page, relay channel pages, etc.  These pages are inherently
+   unevictable and are not managed on the LRU lists.  __mm_populate() includes
+   these ranges, populating the PTEs if not already populated.
+
+4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate()
+   includes these ranges, populating the PTEs if not already populated.
+
+Note that for all of these special VMAs, mlock_fixup() does not set the
+VM_LOCKED flag.  Therefore, we won't have to deal with them later during
+munlock(), munmap() or task exit.  Neither does mlock_fixup() account these
+VMAs against the task's "locked_vm".
+
+
+munlock()/munlockall() System Call Handling
+-------------------------------------------
+
+The munlock() and munlockall() system calls are handled by the same
+mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are.
+If called to munlock an already munlocked VMA, mlock_fixup() simply returns.
+Because of the VMA filtering discussed above, VM_LOCKED will not be set in
+any "special" VMAs.  So, those VMAs will be ignored for munlock.
+
+If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
+specified range.  All pages in the VMA are then munlocked by munlock_page() via
+mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same
+function used when mlocking a VMA range, with new flags for the VMA indicating
+that it is munlock() being performed.
+
+munlock_page() uses the mlock pagevec to batch up work to be done under
+lru_lock by  __munlock_page().  __munlock_page() decrements the page's
+mlock_count, and when that reaches 0 it clears PageMlocked and clears
+PageUnevictable, moving the page from unevictable state to inactive LRU.
+
+But in practice that may not work ideally: the page may not yet have reached
+"the unevictable LRU", or it may have been temporarily isolated from it.  In
+those cases its mlock_count field is unusable and must be assumed to be 0: so
+that the page will be rescued to an evictable LRU, then perhaps be mlocked
+again later if vmscan finds it in a VM_LOCKED VMA.
+
+
+Migrating MLOCKED Pages
+-----------------------
+
+A page that is being migrated has been isolated from the LRU lists and is held
+locked across unmapping of the page, updating the page's address space entry
+and copying the contents and state, until the page table entry has been
+replaced with an entry that refers to the new page.  Linux supports migration
+of mlocked pages and other unevictable pages.  PG_mlocked is cleared from the
+the old page when it is unmapped from the last VM_LOCKED VMA, and set when the
+new page is mapped in place of migration entry in a VM_LOCKED VMA.  If the page
+was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the
+page was unevictable for other reasons, PG_unevictable is copied explicitly.
+
+Note that page migration can race with mlocking or munlocking of the same page.
+There is mostly no problem since page migration requires unmapping all PTEs of
+the old page (including munlock where VM_LOCKED), then mapping in the new page
+(including mlock where VM_LOCKED).  The page table locks provide sufficient
+synchronization.
+
+However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA,
+before mlocking any pages already present, if one of those pages were migrated
+before mlock_pte_range() reached it, it would get counted twice in mlock_count.
+To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO,
+so that mlock_vma_page() will skip it.
+
+To complete page migration, we place the old and new pages back onto the LRU
+afterwards.  The "unneeded" page - old page on success, new page on failure -
+is freed when the reference count held by the migration process is released.
+
+
+Compacting MLOCKED Pages
+------------------------
+
+The memory map can be scanned for compactable regions and the default behavior
+is to let unevictable pages be moved.  /proc/sys/vm/compact_unevictable_allowed
+controls this behavior (see Documentation/admin-guide/sysctl/vm.rst).  The work
+of compaction is mostly handled by the page migration code and the same work
+flow as described in Migrating MLOCKED Pages will apply.
+
+
+MLOCKING Transparent Huge Pages
+-------------------------------
+
+A transparent huge page is represented by a single entry on an LRU list.
+Therefore, we can only make unevictable an entire compound page, not
+individual subpages.
+
+If a user tries to mlock() part of a huge page, and no user mlock()s the
+whole of the huge page, we want the rest of the page to be reclaimable.
+
+We cannot just split the page on partial mlock() as split_huge_page() can
+fail and a new intermittent failure mode for the syscall is undesirable.
+
+We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
+the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
+
+This way the huge page is accessible for vmscan.  Under memory pressure the
+page will be split, subpages which belong to VM_LOCKED VMAs will be moved
+to the unevictable LRU and the rest can be reclaimed.
+
+/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
+of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs.
+
+
+mmap(MAP_LOCKED) System Call Handling
+-------------------------------------
+
+In addition to the mlock(), mlock2() and mlockall() system calls, an application
+can request that a region of memory be mlocked by supplying the MAP_LOCKED flag
+to the mmap() call.  There is one important and subtle difference here, though.
+mmap() + mlock() will fail if the range cannot be faulted in (e.g. because
+mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail.
+The mmaped area will still have properties of the locked area - pages will not
+get swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a task
+that has previously called mlockall() with the MCL_FUTURE flag will result
+in the newly mapped memory being mlocked.  Before the unevictable/mlock
+changes, the kernel simply called make_pages_present() to allocate pages
+and populate the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure,
+the mmap() handler and task address space expansion functions call
+populate_vma_page_range() specifying the vma and the address range to mlock.
+
+
+munmap()/exit()/exec() System Call Handling
+-------------------------------------------
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any
+way, so unmapping them required no processing.
+
+For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
+munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
+(unless it was a PTE mapping of a part of a transparent huge page).
+
+munlock_page() uses the mlock pagevec to batch up work to be done under
+lru_lock by  __munlock_page().  __munlock_page() decrements the page's
+mlock_count, and when that reaches 0 it clears PageMlocked and clears
+PageUnevictable, moving the page from unevictable state to inactive LRU.
+
+But in practice that may not work ideally: the page may not yet have reached
+"the unevictable LRU", or it may have been temporarily isolated from it.  In
+those cases its mlock_count field is unusable and must be assumed to be 0: so
+that the page will be rescued to an evictable LRU, then perhaps be mlocked
+again later if vmscan finds it in a VM_LOCKED VMA.
+
+
+Truncating MLOCKED Pages
+------------------------
+
+File truncation or hole punching forcibly unmaps the deleted pages from
+userspace; truncation even unmaps and deletes any private anonymous pages
+which had been Copied-On-Write from the file pages now being truncated.
+
+Mlocked pages can be munlocked and deleted in this way: like with munmap(),
+for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
+munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
+(unless it was a PTE mapping of a part of a transparent huge page).
+
+However, if there is a racing munlock(), since mlock_vma_pages_range() starts
+munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages
+present, if one of those pages were unmapped by truncation or hole punch before
+mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA,
+and would not be counted out of mlock_count.  In this rare case, a page may
+still appear as PageMlocked after it has been fully unmapped: and it is left to
+release_pages() (or __page_cache_release()) to clear it and update statistics
+before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared,
+which is usually 0).
+
+
+Page Reclaim in shrink_*_list()
+-------------------------------
+
+vmscan's shrink_active_list() culls any obviously unevictable pages -
+i.e. !page_evictable(page) pages - diverting those to the unevictable list.
+However, shrink_active_list() only sees unevictable pages that made it onto the
+active/inactive LRU lists.  Note that these pages do not have PageUnevictable
+set - otherwise they would be on the unevictable list and shrink_active_list()
+would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+ (1) ramfs pages that have been placed on the LRU lists when first allocated.
+
+ (2) SHM_LOCK'd shared memory pages.  shmctl(SHM_LOCK) does not attempt to
+     allocate or fault in the pages in the shared memory region.  This happens
+     when an application accesses the page the first time after SHM_LOCK'ing
+     the segment.
+
+ (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked,
+     but events left mlock_count too low, so they were munlocked too early.
+
+vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously
+unevictable pages found on the inactive lists to the appropriate memory cgroup
+and node unevictable list.
+
+rmap's page_referenced_one(), called via vmscan's shrink_active_list() or
+shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(),
+check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page()
+to correct them.  Such pages are culled to the unevictable list when released
+by the shrinker.
diff --git a/Documentation/mm/vmalloc.rst b/Documentation/mm/vmalloc.rst
new file mode 100644
index 000000000000..363fe20d6b9f
--- /dev/null
+++ b/Documentation/mm/vmalloc.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+Virtually Contiguous Memory Allocation
+======================================
diff --git a/Documentation/mm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst
new file mode 100644
index 000000000000..fc8c67833af6
--- /dev/null
+++ b/Documentation/mm/vmalloced-kernel-stacks.rst
@@ -0,0 +1,153 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Virtually Mapped Kernel Stack Support
+=====================================
+
+:Author: Shuah Khan <skhan@linuxfoundation.org>
+
+.. contents:: :local:
+
+Overview
+--------
+
+This is a compilation of information from the code and original patch
+series that introduced the `Virtually Mapped Kernel Stacks feature
+<https://lwn.net/Articles/694348/>`
+
+Introduction
+------------
+
+Kernel stack overflows are often hard to debug and make the kernel
+susceptible to exploits. Problems could show up at a later time making
+it difficult to isolate and root-cause.
+
+Virtually-mapped kernel stacks with guard pages causes kernel stack
+overflows to be caught immediately rather than causing difficult to
+diagnose corruptions.
+
+HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
+support for virtually mapped stacks with guard pages. This feature
+causes reliable faults when the stack overflows. The usability of
+the stack trace after overflow and response to the overflow itself
+is architecture dependent.
+
+.. note::
+        As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
+        support for VMAP_STACK.
+
+HAVE_ARCH_VMAP_STACK
+--------------------
+
+Architectures that can support Virtually Mapped Kernel Stacks should
+enable this bool configuration option. The requirements are:
+
+- vmalloc space must be large enough to hold many kernel stacks. This
+  may rule out many 32-bit architectures.
+- Stacks in vmalloc space need to work reliably.  For example, if
+  vmap page tables are created on demand, either this mechanism
+  needs to work while the stack points to a virtual address with
+  unpopulated page tables or arch code (switch_to() and switch_mm(),
+  most likely) needs to ensure that the stack's page table entries
+  are populated before running on a possibly unpopulated stack.
+- If the stack overflows into a guard page, something reasonable
+  should happen. The definition of "reasonable" is flexible, but
+  instantly rebooting without logging anything would be unfriendly.
+
+VMAP_STACK
+----------
+
+VMAP_STACK bool configuration option when enabled allocates virtually
+mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
+
+- Enable this if you want the use virtually-mapped kernel stacks
+  with guard pages. This causes kernel stack overflows to be caught
+  immediately rather than causing difficult-to-diagnose corruption.
+
+.. note::
+
+        Using this feature with KASAN requires architecture support
+        for backing virtual mappings with real shadow memory, and
+        KASAN_VMALLOC must be enabled.
+
+.. note::
+
+        VMAP_STACK is enabled, it is not possible to run DMA on stack
+        allocated data.
+
+Kernel configuration options and dependencies keep changing. Refer to
+the latest code base:
+
+`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`
+
+Allocation
+-----------
+
+When a new kernel thread is created, thread stack is allocated from
+virtually contiguous memory pages from the page level allocator. These
+pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
+protections.
+
+alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
+with PAGE_KERNEL protections.
+
+- Allocated stacks are cached and later reused by new threads, so memcg
+  accounting is performed manually on assigning/releasing stacks to tasks.
+  Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
+- vm_struct is cached to be able to find when thread free is initiated
+  in interrupt context. free_thread_stack() can be called in interrupt
+  context.
+- On arm64, all VMAP's stacks need to have the same alignment to ensure
+  that VMAP'd stack overflow detection works correctly. Arch specific
+  vmap stack allocator takes care of this detail.
+- This does not address interrupt stacks - according to the original patch
+
+Thread stack allocation is initiated from clone(), fork(), vfork(),
+kernel_thread() via kernel_clone(). Leaving a few hints for searching
+the code base to understand when and how thread stack is allocated.
+
+Bulk of the code is in:
+`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`.
+
+stack_vm_area pointer in task_struct keeps track of the virtually allocated
+stack and a non-null stack_vm_area pointer serves as a indication that the
+virtually mapped kernel stacks are enabled.
+
+::
+
+        struct vm_struct *stack_vm_area;
+
+Stack overflow handling
+-----------------------
+
+Leading and trailing guard pages help detect stack overflows. When stack
+overflows into the guard pages, handlers have to be careful not overflow
+the stack again. When handlers are called, it is likely that very little
+stack space is left.
+
+On x86, this is done by handling the page fault indicating the kernel
+stack overflow on the double-fault stack.
+
+Testing VMAP allocation with guard pages
+----------------------------------------
+
+How do we ensure that VMAP_STACK is actually allocating with a leading
+and trailing guard page? The following lkdtm tests can help detect any
+regressions.
+
+::
+
+        void lkdtm_STACK_GUARD_PAGE_LEADING()
+        void lkdtm_STACK_GUARD_PAGE_TRAILING()
+
+Conclusions
+-----------
+
+- A percpu cache of vmalloced stacks appears to be a bit faster than a
+  high-order stack allocation, at least when the cache hits.
+- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
+  simply embed the thread_info (containing only flags) and 'int cpu' into
+  task_struct.
+- The thread stack can be free'ed as soon as the task is dead (without
+  waiting for RCU) and then, if vmapped stacks are in use, cache the
+  entire stack for reuse on the same cpu.
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
new file mode 100644
index 000000000000..c9c495f62d12
--- /dev/null
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -0,0 +1,223 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+A vmemmap diet for HugeTLB and Device DAX
+=========================================
+
+HugeTLB
+=======
+
+The struct page structures (page structs) are used to describe a physical
+page frame. By default, there is a one-to-one mapping from a page frame to
+it's corresponding page struct.
+
+HugeTLB pages consist of multiple base page size pages and is supported by many
+architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
+details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
+currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
+consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
+For each base page, there is a corresponding page struct.
+
+Within the HugeTLB subsystem, only the first 4 page structs are used to
+contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
+this upper limit. The only 'useful' information in the remaining page structs
+is the compound_head field, and this field is the same for all tail pages.
+
+By removing redundant page structs for HugeTLB pages, memory can be returned
+to the buddy allocator for other uses.
+
+Different architectures support different HugeTLB pages. For example, the
+following table is the HugeTLB page size supported by x86 and arm64
+architectures. Because arm64 supports 4k, 16k, and 64k base pages and
+supports contiguous entries, so it supports many kinds of sizes of HugeTLB
+page.
+
++--------------+-----------+-----------------------------------------------+
+| Architecture | Page Size |                HugeTLB Page Size              |
++--------------+-----------+-----------+-----------+-----------+-----------+
+|    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
++--------------+-----------+-----------+-----------+-----------+-----------+
+|              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
+|              +-----------+-----------+-----------+-----------+-----------+
+|    arm64     |   16KB    |    2MB    |   32MB    |     1GB   |           |
+|              +-----------+-----------+-----------+-----------+-----------+
+|              |   64KB    |    2MB    |  512MB    |    16GB   |           |
++--------------+-----------+-----------+-----------+-----------+-----------+
+
+When the system boot up, every HugeTLB page has more than one struct page
+structs which size is (unit: pages)::
+
+   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+
+Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
+of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
+relationship::
+
+   HugeTLB_Size = n * PAGE_SIZE
+
+Then::
+
+   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+               = n * sizeof(struct page) / PAGE_SIZE
+
+We can use huge mapping at the pud/pmd level for the HugeTLB page.
+
+For the HugeTLB page of the pmd level mapping, then::
+
+   struct_size = n * sizeof(struct page) / PAGE_SIZE
+               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
+               = sizeof(struct page) / sizeof(pte_t)
+               = 64 / 8
+               = 8 (pages)
+
+Where n is how many pte entries which one page can contains. So the value of
+n is (PAGE_SIZE / sizeof(pte_t)).
+
+This optimization only supports 64-bit system, so the value of sizeof(pte_t)
+is 8. And this optimization also applicable only when the size of struct page
+is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
+size of struct page structs of it is 8 page frames which size depends on the
+size of the base page.
+
+For the HugeTLB page of the pud level mapping, then::
+
+   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
+               = PAGE_SIZE / 8 * 8 (pages)
+               = PAGE_SIZE (pages)
+
+Where the struct_size(pmd) is the size of the struct page structs of a
+HugeTLB page of the pmd level mapping.
+
+E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
+HugeTLB page consists in 4096.
+
+Next, we take the pmd level mapping of the HugeTLB page as an example to
+show the internal implementation of this optimization. There are 8 pages
+struct page structs associated with a HugeTLB page which is pmd mapped.
+
+Here is how things look before optimization::
+
+    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | -------------> |     2     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     3     | -------------> |     3     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     4     | -------------> |     4     |
+ |    PMD    |                     +-----------+                +-----------+
+ |   level   |                     |     5     | -------------> |     5     |
+ |  mapping  |                     +-----------+                +-----------+
+ |           |                     |     6     | -------------> |     6     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     7     | -------------> |     7     |
+ |           |                     +-----------+                +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+The value of page->compound_head is the same for all tail pages. The first
+page of page structs (page 0) associated with the HugeTLB page contains the 4
+page structs necessary to describe the HugeTLB. The only use of the remaining
+pages of page structs (page 1 to page 7) is to point to page->compound_head.
+Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
+will be used for each HugeTLB page. This will allow us to free the remaining
+7 pages to the buddy allocator.
+
+Here is how things look after remapping::
+
+    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                  | | | | | |
+ |           |                     |     2     | -----------------+ | | | | |
+ |           |                     +-----------+                    | | | | |
+ |           |                     |     3     | -------------------+ | | | |
+ |           |                     +-----------+                      | | | |
+ |           |                     |     4     | ---------------------+ | | |
+ |    PMD    |                     +-----------+                        | | |
+ |   level   |                     |     5     | -----------------------+ | |
+ |  mapping  |                     +-----------+                          | |
+ |           |                     |     6     | -------------------------+ |
+ |           |                     +-----------+                            |
+ |           |                     |     7     | ---------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
+vmemmap pages and restore the previous mapping relationship.
+
+For the HugeTLB page of the pud level mapping. It is similar to the former.
+We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
+
+Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
+(e.g. aarch64) provides a contiguous bit in the translation table entries
+that hints to the MMU to indicate that it is one of a contiguous set of
+entries that can be cached in a single TLB entry.
+
+The contiguous bit is used to increase the mapping size at the pmd and pte
+(last) level. So this type of HugeTLB page can be optimized only when its
+size of the struct page structs is greater than 1 page.
+
+Notice: The head vmemmap page is not freed to the buddy allocator and all
+tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
+more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
+associated with each HugeTLB page. The compound_head() can handle this
+correctly (more details refer to the comment above compound_head()).
+
+Device DAX
+==========
+
+The device-dax interface uses the same tail deduplication technique explained
+in the previous chapter, except when used with the vmemmap in
+the device (altmap).
+
+The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
+PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+
+The differences with HugeTLB are relatively minor.
+
+It only use 3 page structs for storing all information as opposed
+to 4 on HugeTLB pages.
+
+There's no remapping of vmemmap given that device-dax memory is not part of
+System RAM ranges initialized at boot. Thus the tail page deduplication
+happens at a later stage when we populate the sections. HugeTLB reuses the
+the head vmemmap page representing, whereas device-dax reuses the tail
+vmemmap page. This results in only half of the savings compared to HugeTLB.
+
+Deduplicated tail pages are not mapped read-only.
+
+Here's how things look like on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PMD    |                     +-----------+                       | | |
+ |   level   |                     |     5     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     6     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     7     | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
diff --git a/Documentation/mm/z3fold.rst b/Documentation/mm/z3fold.rst
new file mode 100644
index 000000000000..224e3c61d686
--- /dev/null
+++ b/Documentation/mm/z3fold.rst
@@ -0,0 +1,30 @@
+.. _z3fold:
+
+======
+z3fold
+======
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for higher compression
+ratio keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+  via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages unlike zbud which can store at most 2. Therefore the
+compression ratio goes to around 2.7x while zbud's one is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes actual location of the allocated object.
+
+Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
+depend on MMU enabled and provides more predictable reclaim behavior
+which makes it a better fit for small and response-critical systems.
diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst
new file mode 100644
index 000000000000..6e79893d6132
--- /dev/null
+++ b/Documentation/mm/zsmalloc.rst
@@ -0,0 +1,82 @@
+.. _zsmalloc:
+
+========
+zsmalloc
+========
+
+This allocator is designed for use with zram. Thus, the allocator is
+supposed to work well under low memory conditions. In particular, it
+never attempts higher order page allocation which is very likely to
+fail under memory pressure. On the other hand, if we just use single
+(0-order) pages, it would suffer from very high fragmentation --
+any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+This was one of the major issues with its predecessor (xvmalloc).
+
+To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+and links them together using various 'struct page' fields. These linked
+pages act as a single higher-order page i.e. an object can span 0-order
+page boundaries. The code refers to these linked pages as a single entity
+called zspage.
+
+For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+since this satisfies the requirements of all its current users (in the
+worst case, page is incompressible and is thus stored "as-is" i.e. in
+uncompressed form). For allocation requests larger than this size, failure
+is returned (see zs_malloc).
+
+Additionally, zs_malloc() does not return a dereferenceable pointer.
+Instead, it returns an opaque handle (unsigned long) which encodes actual
+location of the allocated object. The reason for this indirection is that
+zsmalloc does not keep zspages permanently mapped since that would cause
+issues on 32-bit systems where the VA region for kernel space mappings
+is very small. So, before using the allocating memory, the object has to
+be mapped using zs_map_object() to get a usable pointer and subsequently
+unmapped using zs_unmap_object().
+
+stat
+====
+
+With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
+``/sys/kernel/debug/zsmalloc/<user name>``. Here is a sample of stat output::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
+    ...
+    ...
+     9   176           0            1           186        129          8                4
+    10   192           1            0          2880       2872        135                3
+    11   208           0            1           819        795         42                2
+    12   224           0            1           219        159         12                4
+    ...
+    ...
+
+
+class
+	index
+size
+	object size zspage stores
+almost_empty
+	the number of ZS_ALMOST_EMPTY zspages(see below)
+almost_full
+	the number of ZS_ALMOST_FULL zspages(see below)
+obj_allocated
+	the number of objects allocated
+obj_used
+	the number of objects allocated to the user
+pages_used
+	the number of pages allocated for the class
+pages_per_zspage
+	the number of 0-order pages to make a zspage
+
+We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
+
+* n = number of allocated objects
+* N = total number of objects zspage can store
+* f = fullness_threshold_frac(ie, 4 at the moment)
+
+Similarly, we assign zspage to:
+
+* ZS_ALMOST_FULL  when n > N / f
+* ZS_EMPTY        when n == 0
+* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
index 0c8276109fc0..30c69e1f44fe 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst
@@ -13,7 +13,7 @@
 监测数据访问
 ============
 
-:doc:`DAMON </vm/damon/index>` 允许轻量级的数据访问监测。使用DAMON，
+:doc:`DAMON </mm/damon/index>` 允许轻量级的数据访问监测。使用DAMON，
 用户可以分析他们系统的内存访问模式，并优化它们。
 
 .. toctree::
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
index 1500bdbf338a..c976f3e33ffd 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
@@ -229,4 +229,4 @@ DAMON_RECLAIM再次什么都不做，这样我们就可以退回到基于LRU列
 
 .. [1] https://research.google/pubs/pub48551/
 .. [2] https://lwn.net/Articles/787611/
-.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
+.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
index eee0e8c5c368..cd41ada4fdad 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
@@ -33,9 +33,9 @@ DAMON 为不同的用户提供了下面这些接口。
   口相同。这将在下一个LTS内核发布后被移除，所以用户应该转移到
   :ref:`sysfs interface <sysfs_interface>`。
 - *内核空间编程接口。*
-  :doc:`这 </vm/damon/api>` 这是为内核空间程序员准备的。使用它，用户可以通过为你编写内
+  :doc:`这 </mm/damon/api>` 这是为内核空间程序员准备的。使用它，用户可以通过为你编写内
   核空间的DAMON应用程序，最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。
-  详细情况请参考接口 :doc:`文件 </vm/damon/api>`。
+  详细情况请参考接口 :doc:`文件 </mm/damon/api>`。
 
 sysfs接口
 =========
@@ -148,7 +148,7 @@ contexts/<N>/monitoring_attrs/
 在 ``nr_regions`` 目录下，有两个文件分别用于DAMON监测区域的下限和上限（``min`` 和 ``max`` ），
 这两个文件控制着监测的开销。你可以通过向这些文件的写入和读出来设置和获取这些值。
 
-关于间隔和监测区域范围的更多细节，请参考设计文件 (:doc:`/vm/damon/design`)。
+关于间隔和监测区域范围的更多细节，请参考设计文件 (:doc:`/mm/damon/design`)。
 
 contexts/<N>/targets/
 ---------------------
@@ -318,7 +318,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
 ----
 
 用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔``
-以及监测目标区域的最小/最大数量。要详细了解监测属性，请参考 `:doc:/vm/damon/design` 。例如，
+以及监测目标区域的最小/最大数量。要详细了解监测属性，请参考 `:doc:/mm/damon/design` 。例如，
 下面的命令将这些值设置为5ms、100ms、1000ms、10和1000，然后再次检查::
 
     # cd <debugfs>/damon
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 26d9913fc8b6..b03020c8b2ab 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -101,7 +101,7 @@ Todolist:
 ========
 
 如何在内核中分配和使用内存。请注意，在
-:doc:`/vm/index` 中有更多的内存管理文档。
+:doc:`/mm/index` 中有更多的内存管理文档。
 
 .. toctree::
    :maxdepth: 1
diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst
index ad7bb8c17562..bf85baca8b3e 100644
--- a/Documentation/translations/zh_CN/index.rst
+++ b/Documentation/translations/zh_CN/index.rst
@@ -118,7 +118,7 @@ TODOList:
    sound/index
    filesystems/index
    scheduler/index
-   vm/index
+   mm/index
    peci/index
 
 TODOList:
diff --git a/Documentation/translations/zh_CN/mm/active_mm.rst b/Documentation/translations/zh_CN/mm/active_mm.rst
new file mode 100644
index 000000000000..c2816f523bd7
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/active_mm.rst
@@ -0,0 +1,85 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/active_mm.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=========
+Active MM
+=========
+
+这是一封linux之父回复开发者的一封邮件，所以翻译时我尽量保持邮件格式的完整。
+
+::
+
+ List:       linux-kernel
+ Subject:    Re: active_mm
+ From:       Linus Torvalds <torvalds () transmeta ! com>
+ Date:       1999-07-30 21:36:24
+
+ 因为我并不经常写解释，所以已经抄送到linux-kernel邮件列表，而当我做这些，
+ 且更多的人在阅读它们时，我觉得棒极了。
+
+ 1999年7月30日 星期五， David Mosberger 写道：
+ >
+ > 是否有一个简短的描述，说明task_struct中的
+ >  "mm" 和 "active_mm"应该如何使用？ (如果
+ > 这个问题在邮件列表中讨论过，我表示歉意--我刚
+ > 刚度假回来，有一段时间没能关注linux-kernel了）。
+
+ 基本上，新的设定是：
+
+  - 我们有“真实地址空间”和“匿名地址空间”。区别在于，匿名地址空间根本不关心用
+    户级页表，所以当我们做上下文切换到匿名地址空间时，我们只是让以前的地址空间
+    处于活动状态。
+
+    一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线
+    程基本上都属于这一类，但即使是“真正的”线程也可以暂时说在一定时间内它们不
+    会对用户空间感兴趣，调度器不妨试着避免在切换VM状态上浪费时间。目前只有老
+    式的bdflush sync能做到这一点。
+
+  - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说，tsk->mm将是NULL，
+    其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。
+
+  - 然而，我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此，我们
+    有 “tsk->active_mm”，它显示了当前活动的地址空间是什么。
+
+    规则是，对于一个有真实地址空间的进程（即tsk->mm是 non-NULL），active_mm
+    显然必须与真实的mm相同。
+
+    对于一个匿名进程，tsk->mm == NULL，而tsk->active_mm是匿名进程运行时
+    “借用”的mm。当匿名进程被调度走时，借用的地址空间被返回并清除。
+
+ 为了支持所有这些，“struct mm_struct”现在有两个计数器：一个是 “mm_users”
+ 计数器，即有多少 “真正的地址空间用户”，另一个是 “mm_count”计数器，即 “lazy”
+ 用户（即匿名用户）的数量，如果有任何真正的用户，则加1。
+
+ 通常情况下，至少有一个真正的用户，但也可能是真正的用户在另一个CPU上退出，而
+ 一个lazy的用户仍在活动，所以你实际上得到的情况是，你有一个地址空间 **只**
+ 被lazy的用户使用。这通常是一个短暂的生命周期状态，因为一旦这个线程被安排给一
+ 个真正的线程，这个 “僵尸” mm就会被释放，因为 “mm_count”变成了零。
+
+ 另外，一个新的规则是，**没有人** 再把 “init_mm” 作为一个真正的MM了。
+ “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”，事实上，它主
+ 要是在启动时使用，当时还没有真正的VM被创建。因此，用来检查的代码
+
+   if (current->mm == &init_mm)
+
+ 一般来说，应该用
+
+   if (!current->mm)
+
+ 取代上面的写法（这更有意义--测试基本上是 “我们是否有一个用户环境”，并且通常
+ 由缺页异常处理程序和类似的东西来完成）。
+
+ 总之，我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1，因为它稍微改
+ 变了接口以适配alpha（谁会想到呢，但alpha体系结构上下文切换代码实际上最终是
+ 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的，alpha的PALcode将两者
+ 连接起来，你需要同时切换两者）。
+
+ (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/translations/zh_CN/mm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst
new file mode 100644
index 000000000000..6fd79209c307
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/balance.rst
@@ -0,0 +1,81 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/balance.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+内存平衡
+========
+
+2000年1月开始，作者：Kanoj Sarcar <kanoj@sgi.com>
+
+对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配，需要进行
+内存平衡。
+
+调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个
+原因可能是，调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退
+选项的机会主义高阶分配请求中。在这种情况下，调用者可能也希望避免唤醒kswapd。
+
+__GFP_IO分配请求是为了防止文件系统死锁。
+
+在没有非睡眠分配请求的情况下，做平衡似乎是有害的。页面回收可以被懒散地启动，也就是
+说，只有在需要的时候（也就是区域的空闲内存为0），而不是让它成为一个主动的过程。
+
+也就是说，内核应该尝试从直接映射池中满足对直接映射页的请求，而不是回退到dma池中，
+这样就可以保持dma池为dma请求（不管是不是原子的）所填充。类似的争论也适用于高内存
+和直接映射的页面。相反，如果有很多空闲的dma页，最好是通过从dma池中分配一个来满足
+常规的内存请求，而不是产生常规区域平衡的开销。
+
+在2.2中，只有当空闲页总数低于总内存的1/64时，才会启动内存平衡/页面回收。如果dma
+和常规内存的比例合适，即使dma区完全空了，也很可能不会进行平衡。2.2已经在不同内存
+大小的生产机器上运行，即使有这个问题存在，似乎也做得不错。在2.3中，由于HIGHMEM的
+存在，这个问题变得更加严重。
+
+在2.3中，区域平衡可以用两种方式之一来完成：根据区域的大小（可能是低级区域的大小），
+我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是，在平衡的时
+候，我们不需要看低级区的大小，坏的方面是，我们可能会因为忽略低级区可能较低的使用率
+而做过于频繁的平衡。另外，只要对分配程序稍作修改，就有可能将memclass()宏简化为一
+个简单的等式。
+
+另一个可能的解决方案是，我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其
+低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题，并尽可能地保持了与2.2行为
+的接近。另外，平衡算法在各种架构上的工作方式也是一样的，这些架构有不同数量和类型的
+内存区。如果我们想变得更花哨一点，我们可以在未来为不同区域的自由页面分配不同的权重。
+
+请注意，如果普通区的大小与dma区相比是巨大的，那么在决定是否平衡普通区的时候，考虑
+空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。
+
+所附的补丁实现了第二个解决方案。它还 “修复”了两个问题：首先，在低内存条件下，kswapd
+被唤醒，就像2.2中的非睡眠分配。第二，HIGHMEM区也被平衡了，以便给replace_with_highmem()
+一个争取获得HIGHMEM页的机会，同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM
+页不会被泄露（例如，在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下）。
+
+kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的，可能
+是因为所有的分配请求都来自中断上下文，而所有的进程上下文都在睡眠。对于2.3，
+kswapd并不真正需要平衡高内存区，因为中断上下文并不请求高内存页。kswapd看zone
+结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。
+
+如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力，而该区的内存压力
+已经低于其水位，则会进行偷取。
+
+watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd：
+这些是每个区的字段，用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时，
+hysteric 的字段low_on_memory被设置。这个字段会一直被设置，直到空闲页数变成水位
+[WMARK_HIGH]。当low_on_memory被设置时，页面分配请求将尝试释放该区域的一些页面（如果
+请求中设置了GFP_WAIT）。与此相反的是，决定唤醒kswapd以释放一些区的页。这个决定不是基于
+hysteresis 的，而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行；在这种情况下，
+zone_wake_kswapd也被设置。
+
+
+我所听到的（超棒的）想法：
+
+1. 动态经历应该影响平衡：可以跟踪一个区的失败请求的数量，并反馈到平衡方案中（jalvo@mbay.net）。
+
+2. 实现一个类似于replace_with_highmem()的replace_with_regular()，以保留dma页面。
+   (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/translations/zh_CN/mm/damon/api.rst b/Documentation/translations/zh_CN/mm/damon/api.rst
new file mode 100644
index 000000000000..5593a83c86bc
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/api.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/api.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=======
+API参考
+=======
+
+内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` ，
+它位于源代码树的include/linux/。
+
+结构体
+======
+
+该API在以下内核代码中:
+
+include/linux/damon.h
+
+
+函数
+====
+
+该API在以下内核代码中:
+
+mm/damon/core.c
diff --git a/Documentation/translations/zh_CN/mm/damon/design.rst b/Documentation/translations/zh_CN/mm/damon/design.rst
new file mode 100644
index 000000000000..16e3db34a7dd
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/design.rst
@@ -0,0 +1,140 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/design.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+====
+设计
+====
+
+可配置的层
+==========
+
+DAMON提供了数据访问监控功能，同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间
+并为之优化的基元。另一方面，作为DAMON的核心，准确性和开销的权衡机制是在纯逻辑空间中。DAMON
+将这两部分分离在不同的层中，并定义了它的接口，以允许各种低层次的基元实现与核心逻辑的配置。
+
+由于这种分离的设计和可配置的接口，用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的
+任何地址空间。如果没有提供合适的，用户可以自己实现基元。
+
+例如，物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。
+另外，如果某些架构或设备支持特殊的优化访问检查基元，这些基元将很容易被配置。
+
+
+特定地址空间基元的参考实现
+==========================
+
+基本访问监测的低级基元被定义为两部分。:
+
+1. 确定地址空间的监测目标地址范围
+2. 目标空间中特定地址范围的访问检查。
+
+DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。
+
+
+基于VMA的目标地址范围构造
+-------------------------
+
+这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间，只是要求用户手动设置监控目标地址范围。
+
+在进程的超级巨大的虚拟地址空间中，只有小部分被映射到物理内存并被访问。因此，跟踪未映射的地
+址区域只是一种浪费。然而，由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声，所以严
+格来说，跟踪每一个映射并不是必须的，但在某些情况下甚至会产生很高的开销。也就是说，监测目标
+内部过于巨大的未映射区域应该被移除，以不占用自适应机制的时间。
+
+出于这个原因，这个实现将复杂的映射转换为三个不同的区域，覆盖地址空间的每个映射区域。这三个
+区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面
+的mmap()区域之间的间隙，以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙
+在通常的地址空间中是异常巨大的，排除这些间隙就足以做出合理的权衡。下面详细说明了这一点::
+
+    <heap>
+    <BIG UNMAPPED REGION 1>
+    <uppermost mmap()-ed region>
+    (small mmap()-ed regions and munmap()-ed regions)
+    <lowermost mmap()-ed region>
+    <BIG UNMAPPED REGION 2>
+    <stack>
+
+
+基于PTE访问位的访问检查
+-----------------------
+
+物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中
+找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表，而物理地址的实现则
+是查找与该地址有映射关系的每一个页表。通过这种方式，实现者找到并清除下一个采样目标地址的位，
+并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统，即空闲页跟
+踪和回收逻辑。为了避免这种干扰，DAMON使其与空闲页面跟踪相互排斥，并使用 ``PG_idle`` 和
+``PG_young`` 页面标志来解决与回收逻辑的冲突，就像空闲页面跟踪那样。
+
+
+独立于地址空间的核心机制
+========================
+
+下面四个部分分别描述了DAMON的核心机制和五个监测属性，即 ``采样间隔`` 、 ``聚集间隔`` 、
+``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。
+
+
+访问频率监测
+------------
+
+DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置
+``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说，DAMON检查每个 ``采样间隔`` 对每
+个页面的访问，并将结果汇总。换句话说，计算每个页面的访问次数。在每个 ``聚合间隔`` 过
+去后，DAMON调用先前由用户注册的回调函数，以便用户可以阅读聚合的结果，然后再清除这些结
+果。这可以用以下简单的伪代码来描述::
+
+    while monitoring_on:
+        for page in monitoring_target:
+            if accessed(page):
+                nr_accesses[page] += 1
+        if time() % aggregation_interval == 0:
+            for callback in user_registered_callbacks:
+                callback(monitoring_target, nr_accesses)
+            for page in monitoring_target:
+                nr_accesses[page] = 0
+        sleep(sampling interval)
+
+这种机制的监测开销将随着目标工作负载规模的增长而任意增加。
+
+
+基于区域的抽样调查
+------------------
+
+为了避免开销的无限制增加，DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持
+这个假设（一个区域内的页面具有相同的访问频率），该区域内就只需要检查一个页面。因此，对
+于每个 ``采样间隔`` ，DAMON在每个区域中随机挑选一个页面，等待一个 ``采样间隔`` ，检
+查该页面是否同时被访问，如果被访问则增加该区域的访问频率。因此，监测开销是可以通过设置
+区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。
+
+然而，如果假设没有得到保证，这个方案就不能保持输出的质量。
+
+
+适应性区域调整
+--------------
+
+即使最初的监测目标区域被很好地构建以满足假设（同一区域内的页面具有相似的访问频率），数
+据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设，DAMON根据每个
+区域的访问频率自适应地进行合并和拆分。
+
+对于每个 ``聚集区间`` ，它比较相邻区域的访问频率，如果频率差异较小，就合并这些区域。
+然后，在它报告并清除每个区域的聚合接入频率后，如果区域总数不超过用户指定的最大区域数，
+它将每个区域拆分为两个或三个区域。
+
+通过这种方式，DAMON提供了其最佳的质量和最小的开销，同时保持了用户为其权衡设定的界限。
+
+
+动态目标空间更新处理
+--------------------
+
+监测目标地址范围可以动态改变。例如，虚拟内存可以动态地被映射和解映射。物理内存可以被
+热插拔。
+
+由于在某些情况下变化可能相当频繁，DAMON允许监控操作检查动态变化，包括内存映射变化，
+并仅在用户指定的时间间隔（ ``更新间隔`` ）中的每个时间段，将其应用于监控操作相关的
+数据结构，如抽象的监控目标内存区。
\ No newline at end of file
diff --git a/Documentation/translations/zh_CN/mm/damon/faq.rst b/Documentation/translations/zh_CN/mm/damon/faq.rst
new file mode 100644
index 000000000000..de4be417494a
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/faq.rst
@@ -0,0 +1,48 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/faq.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+常见问题
+========
+
+为什么是一个新的子系统，而不是扩展perf或其他用户空间工具？
+==========================================================
+
+首先，因为它需要尽可能的轻量级，以便可以在线使用，所以应该避免任何不必要的开销，如内核-用户
+空间的上下文切换成本。第二，DAMON的目标是被包括内核在内的其他程序所使用。因此，对特定工具
+（如perf）的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。
+
+
+“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗？
+==============================================
+
+闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的，尽管它可以
+使用采样来减少开销。另一方面，DAMON是一个更高层次的框架，用于监控各种地址空间。它专注于内
+存管理优化，并提供复杂的精度/开销处理机制。因此，“空闲页面跟踪” 和 “perf mem” 可以提供
+DAMON输出的一个子集，但不能替代DAMON。
+
+
+DAMON是否只支持虚拟内存？
+=========================
+
+不，DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始
+部分，包括监测目标区域的构造和实际的访问检查。通过这种方式，DAMON用户可以用任何访问检查技
+术来监测任何地址空间。
+
+尽管如此，DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间
+相关功能的实现，以供参考和方便使用。
+
+
+我可以简单地监测页面的粒度吗？
+==============================
+
+是的，你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。
+因为监视目标区域的大小被强制为 ``>=page size`` ，所以区域分割不会产生任何影响。
diff --git a/Documentation/translations/zh_CN/mm/damon/index.rst b/Documentation/translations/zh_CN/mm/damon/index.rst
new file mode 100644
index 000000000000..b03bf307204f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/damon/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/damon/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+==========================
+DAMON:数据访问监视器
+==========================
+
+DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为
+（该核心机制详见(Documentation/translations/zh_CN/mm/damon/design.rst)）
+
+ - *准确度* （监测输出对DRAM级别的内存管理足够有用；但可能不适合CPU Cache级别），
+ - *轻量级* （监控开销低到可以在线应用），以及
+ - *可扩展* （无论目标工作负载的大小，开销的上限值都在恒定范围内）。
+
+因此，利用这个框架，内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实
+验性内存管理优化工作可以再次进行。同时，在用户空间，有一些特殊工作负载的用户可以编写
+个性化的应用程序，以便更好地了解和优化他们的工作负载和系统。
+
+.. toctree::
+   :maxdepth: 2
+
+   faq
+   design
+   api
diff --git a/Documentation/translations/zh_CN/mm/free_page_reporting.rst b/Documentation/translations/zh_CN/mm/free_page_reporting.rst
new file mode 100644
index 000000000000..83b14cce9adf
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/free_page_reporting.rst
@@ -0,0 +1,38 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/_free_page_reporting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==========
+空闲页报告
+==========
+
+空闲页报告是一个API，设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟
+化的情况下是很有用的，客户机能够使用这些数据来通知管理器它不再使用内存中的某些页
+面。
+
+对于驱动，通常是气球驱动要使用这个功能，它将分配和初始化一个page_reporting_dev_info
+结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必
+须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。
+假设没有其他页面报告设备已经注册， 对page_reporting_register的调用将向报告框
+架注册页面报告接口。
+
+一旦注册，页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告
+页面，并在任何足够高的页面被释放之后2秒继续报告。
+
+报告的页面将被存储在传递给报告函数的散列表中，最后一个条目的结束位被设置在条目
+nent-1中。 当页面被报告函数处理时，分配器将无法访问它们。一旦报告函数完成，这些
+页将被返回到它们所获得的自由区域。
+
+在移除使用空闲页报告的驱动之前，有必要调用page_reporting_unregister，以移除
+目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报
+告通过该接口发出。如果另一个驱动或同一驱动被注册，它就有可能恢复前一个驱动在报告
+空闲页方面的工作。
+
+
+Alexander Duyck, 2019年12月04日
diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst
new file mode 100644
index 000000000000..5c18ea2be04f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/frontswap.rst
@@ -0,0 +1,196 @@
+:Original: Documentation/mm/_free_page_reporting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+=========
+Frontswap
+=========
+
+Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中，由
+于交换页被保存在RAM（或类似RAM的设备）中，而不是交换磁盘，因此可以获得巨大的性能
+节省（提高）。
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
+
+Frontswap之所以这么命名，是因为它可以被认为是与swap设备的“back”存储相反。存
+储器被认为是一个同步并发安全的面向页面的“伪RAM设备”，符合transcendent memory
+（如Xen的“tmem”，或内核内压缩内存，又称“zcache”，或未来的类似RAM的设备）的要
+求；这个伪RAM设备不能被内核直接访问或寻址，其大小未知且可能随时间变化。驱动程序通过
+调用frontswap_register_ops将自己与frontswap链接起来，以适当地设置frontswap_ops
+的功能，它提供的功能必须符合某些策略，如下所示:
+
+一个 “init” 将设备准备好接收与指定的交换设备编号（又称“类型”）相关的frontswap
+交换页。一个 “store” 将把该页复制到transcendent memory，并与该页的类型和偏移
+量相关联。一个 “load” 将把该页，如果找到的话，从transcendent memory复制到内核
+内存，但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从
+transcendent memory中删除该页，一个 “invalidate_area” 将删除所有与交换类型
+相关的页（例如，像swapoff）并通知 “device” 拒绝进一步存储该交换类型。
+
+一旦一个页面被成功存储，在该页面上的匹配加载通常会成功。因此，当内核发现自己处于需
+要交换页面的情况时，它首先尝试使用frontswap。如果存储的结果是成功的，那么数据就已
+经成功的保存到了transcendent memory中，并且避免了磁盘写入，如果后来再读回数据，
+也避免了磁盘读取。如果存储返回失败，transcendent memory已经拒绝了该数据，且该页
+可以像往常一样被写入交换空间。
+
+请注意，如果一个页面被存储，而该页面已经存在于transcendent memory中（一个 “重复”
+的存储），要么存储成功，数据被覆盖，要么存储失败，该页面被废止。这确保了旧的数据永远
+不会从frontswap中获得。
+
+如果配置正确，对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的
+debugfs完成的。frontswap的有效性可以通过以下方式测量（在所有交换设备中）:
+
+``failed_stores``
+	有多少次存储的尝试是失败的
+
+``loads``
+	尝试了多少次加载（应该全部成功）
+
+``succ_stores``
+	有多少次存储的尝试是成功的
+
+``invalidates``
+	尝试了多少次作废
+
+后台实现可以提供额外的指标。
+
+经常问到的问题
+==============
+
+* 价值在哪里?
+
+当一个工作负载开始交换时，性能就会下降。Frontswap通过提供一个干净的、动态的接口来
+读取和写入交换页到 “transcendent memory”，从而大大增加了许多这样的工作负载的性
+能，否则内核是无法直接寻址的。当数据被转换为不同的形式和大小（比如压缩）或者被秘密
+移动（对于一些类似RAM的设备来说，这可能对写平衡很有用）时，这个接口是理想的。交换
+页（和被驱逐的页面缓存页）是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。
+
+Frontswap对内核的影响相当小，为各种系统配置中更动态、更灵活的RAM利用提供了巨大的
+灵活性：
+
+在单一内核的情况下，又称“zcache”，页面被压缩并存储在本地内存中，从而增加了可以安
+全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利
+用率。Benchmarks测试显示，当内存压力较低时，几乎没有影响，而在高内存压力下的一些
+工作负载上，则有明显的性能改善（25%以上）。
+
+“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory
+的支持。Frontswap页面像zcache一样被本地压缩，但随后被“remotified” 到另一个系
+统的RAM。这使得RAM可以根据需要动态地来回负载平衡，也就是说，当系统A超载时，它可以
+交换到系统B，反之亦然。RAMster也可以被配置成一个内存服务器，因此集群中的许多服务器
+可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户
+有多少内存可用
+
+在虚拟情况下，虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复
+用。对于RAM来说，这真的很难做到，而且在不改变内核的情况下，要做好这一点的努力基本上
+是失败的（除了一些广为人知的特殊情况下的工作负载）。具体来说，Xen Transcendent Memory
+后端允许管理器拥有的RAM “fallow”，不仅可以在多个虚拟机之间进行“time-shared”，
+而且页面可以被压缩和重复利用，以优化RAM的利用率。当客户操作系统被诱导交出未充分利用
+的RAM时（如 “selfballooning”），突然出现的意外内存压力可能会导致交换；frontswap
+允许这些页面被交换到管理器RAM中或从管理器RAM中交换（如果整体主机系统内存条件允许），
+从而减轻计划外交换可能带来的可怕的性能影响。
+
+一个KVM的实现正在进行中，并且已经被RFC'ed到lkml。而且，利用frontswap，对NVM作为
+内存扩展技术的调查也在进行中。
+
+* 当然，在某些情况下可能有性能上的优势，但frontswap的空间/时间开销是多少？
+
+如果 CONFIG_FRONTSWAP 被禁用，每个 frontswap 钩子都会编译成空，唯一的开销是每
+个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用，但没有
+frontswap的 “backend” 寄存器，每读或写一个交换页就会有一个额外的全局变量，而不
+是零。如果 CONFIG_FRONTSWAP 被启用，并且有一个frontswap的backend寄存器，并且
+后端每次 “store” 请求都失败（即尽管声称可能，但没有提供内存），CPU 的开销仍然可以
+忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前，系统很可能是 I/O 绑定
+的，无论如何使用一小部分的 CPU 都是不相关的。
+
+至于空间，如果CONFIG_FRONTSWAP被启用，并且有一个frontswap的backend注册，那么
+每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换
+页分配的8位（在2.6.34之前是16位）上增加的。(Hugh Dickins观察到，frontswap可能
+会偷取现有的8个比特，但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的
+非常大的交换盘（这很罕见），这是每32GB交换盘1MB开销。
+
+当交换页存储在transcendent memory中而不是写到磁盘上时，有一个副作用，即这可能会
+产生更多的内存压力，有可能超过其他的优点。一个backend，比如zcache，必须实现策略
+来仔细（但动态地）管理内存限制，以确保这种情况不会发生。
+
+* 好吧，那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何？
+
+我们假设在内核初始化过程中，一个frontswap 的 “backend” 已经注册了；这个注册表
+明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提
+供了多少内存是完全动态和随机的。
+
+每当一个交换设备被交换时，就会调用frontswap_init()，把交换设备的编号（又称“类
+型”）作为一个参数传给它。这就通知了frontswap，以期待 “store” 与该号码相关的交
+换页的尝试。
+
+每当交换子系统准备将一个页面写入交换设备时（参见swap_writepage()），就会调用
+frontswap_store。Frontswap与frontswap backend协商，如果backend说它没有空
+间，frontswap_store返回-1，内核就会照常把页换到交换设备上。注意，来自frontswap
+backend的响应对内核来说是不可预测的；它可能选择从不接受一个页面，可能接受每九个
+页面，也可能接受每一个页面。但是如果backend确实接受了一个页面，那么这个页面的数
+据已经被复制并与类型和偏移量相关联了，而且backend保证了数据的持久性。在这种情况
+下，frontswap在交换设备的“frontswap_map” 中设置了一个位，对应于交换设备上的
+页面偏移量，否则它就会将数据写入该设备。
+
+当交换子系统需要交换一个页面时（swap_readpage()），它首先调用frontswap_load()，
+检查frontswap_map，看这个页面是否早先被frontswap backend接受。如果是，该页
+的数据就会从frontswap后端填充，换入就完成了。如果不是，正常的交换代码将被执行，
+以便从真正的交换设备上获得这一页的数据。
+
+所以每次frontswap backend接受一个页面时，交换设备的读取和（可能）交换设备的写
+入都被 “frontswap backend store” 和（可能）“frontswap backend loads”
+所取代，这可能会快得多。
+
+* frontswap不能被配置为一个 “特殊的” 交换设备，它的优先级要高于任何真正的交换
+  设备（例如像zswap，或者可能是swap-over-nbd/NFS）？
+
+首先，现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次
+结构，但这将需要相当大的改变。即使它被重写，现有的交换子系统也使用了块I/O层，它
+假定交换设备是固定大小的，其中的任何页面都是可线性寻址的。Frontswap几乎没有触
+及现有的交换子系统，而是围绕着块I/O子系统的限制，提供了大量的灵活性和动态性。
+
+例如，frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend
+的定义至关重要，因为它赋予了backend完全动态的决定权。在zcache中，人们无法预
+先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝，而 “差” 本身也可
+以根据当前的内存限制动态地定义。
+
+此外，frontswap是完全同步的，而真正的交换设备，根据定义，是异步的，并且使用
+块I/O。块I/O层不仅是不必要的，而且可能进行 “优化”，这对面向RAM的设备来说是
+不合适的，包括将一些页面的写入延迟相当长的时间。同步是必须的，以确保后端的动
+态性，并避免棘手的竞争条件，这将不必要地大大增加frontswap和/或块I/O子系统的
+复杂性。也就是说，只有最初的 “store” 和 “load” 操作是需要同步的。一个独立
+的异步线程可以自由地操作由frontswap存储的页面。例如，RAMster中的 “remotification”
+线程使用标准的异步内核套接字，将压缩的frontswap页面移动到远程机器。同样，
+KVM的客户方实现可以进行客户内压缩，并使用 “batched” hypercalls。
+
+在虚拟化环境中，动态性允许管理程序（或主机操作系统）做“intelligent overcommit”。
+例如，它可以选择只接受页面，直到主机交换可能即将发生，然后强迫客户机做他们
+自己的交换。
+
+transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可
+能失败，所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此，
+frontswap必须作为每个交换设备的 “影子” 来实现，它有可能容纳交换设备可能
+容纳的每一个页面，也有可能根本不容纳任何页面。这意味着frontswap不能包含比
+swap设备总数更多的页面。例如，如果在某些安装上没有配置交换设备，frontswap
+就没有用。无交换设备的便携式设备仍然可以使用frontswap，但是这种设备的
+backend必须配置某种 “ghost” 交换设备，并确保它永远不会被使用。
+
+
+* 为什么会有这种关于 “重复存储” 的奇怪定义？如果一个页面以前被成功地存储过，
+  难道它不能总是被成功地覆盖吗？
+
+几乎总是可以的，不，有时不能。考虑一个例子，数据被压缩了，原来的4K页面被压
+缩到了1K。现在，有人试图用不可压缩的数据覆盖该页，因此会占用整个4K。但是
+backend没有更多的空间了。在这种情况下，这个存储必须被拒绝。每当frontswap
+拒绝一个会覆盖的存储时，它也必须使旧的数据作废，并确保它不再被访问。因为交
+换子系统会把新的数据写到读交换设备上，这是确保一致性的正确做法。
+
+* 为什么frontswap补丁会创建新的头文件swapfile.h？
+
+frontswap代码依赖于一些swap子系统内部的数据结构，这些数据结构多年来一直
+在静态和全局之间来回移动。这似乎是一个合理的妥协：将它们定义为全局，但在一
+个新的包含文件中声明它们，该文件不被包含swap.h的大量源文件所包含。
+
+Dan Magenheimer，最后更新于2012年4月9日
diff --git a/Documentation/translations/zh_CN/mm/highmem.rst b/Documentation/translations/zh_CN/mm/highmem.rst
new file mode 100644
index 000000000000..81202c65e000
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/highmem.rst
@@ -0,0 +1,128 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/highmem.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==========
+高内存处理
+==========
+
+作者: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+.. contents:: :local:
+
+高内存是什么？
+==============
+
+当物理内存的大小接近或超过虚拟内存的最大大小时，就会使用高内存（highmem）。在这一点上，内
+核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内
+存的临时映射。
+
+没有被永久映射覆盖的那部分（物理）内存就是我们所说的 "高内存"。对于这个边界的确切位置，有
+各种架构上的限制。
+
+例如，在i386架构中，我们选择将内核映射到每个进程的虚拟空间，这样我们就不必为内核的进入/退
+出付出全部的TLB作废代价。这意味着可用的虚拟内存空间（i386上为4GiB）必须在用户和内核空间之
+间进行划分。
+
+使用这种方法的架构的传统分配方式是3:1，3GiB用于用户空间，顶部的1GiB用于内核空间。::
+
+		+--------+ 0xffffffff
+		| Kernel |
+		+--------+ 0xc0000000
+		|        |
+		| User   |
+		|        |
+		+--------+ 0x00000000
+
+这意味着内核在任何时候最多可以映射1GiB的物理内存，但是由于我们需要虚拟地址空间来做其他事
+情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少（通常在~896MiB左右）。
+
+其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而，一些硬件（如一些ARM）在使
+用mm上下文标签时，其虚拟空间有限。
+
+
+临时虚拟映射
+============
+
+内核包含几种创建临时映射的方法。:
+
+* vmap().  这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization
+  来解除映射。
+
+* kmap().  这允许对单个页面进行短期映射。它需要synchronization，但在一定程度上被摊销。
+  当以嵌套方式使用时，它也很容易出现死锁，因此不建议在新代码中使用它。
+
+* kmap_atomic().  这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上，
+  它表现得很好，但发布任务因此被要求留在该CPU上直到它完成，以免其他任务取代它的映射。
+
+  kmap_atomic() 也可以由中断上下文使用，因为它不睡眠，而且调用者可能在调用kunmap_atomic()
+  之后才睡眠。
+
+  可以假设k[un]map_atomic()不会失败。
+
+
+使用kmap_atomic
+===============
+
+何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存（见__GFP_HIGHMEM）
+分配的页面的内容时，例如在页缓存中的页面，就会使用它。该API有两个函数，它们的使用方式与
+下面类似::
+
+	/* 找到感兴趣的页面。 */
+	struct page *page = find_get_page(mapping, offset);
+
+	/* 获得对该页内容的访问权。 */
+	void *vaddr = kmap_atomic(page);
+
+	/* 对该页的内容做一些处理。 */
+	memset(vaddr, 0, PAGE_SIZE);
+
+	/* 解除该页面的映射。 */
+	kunmap_atomic(vaddr);
+
+注意，kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。
+
+如果你需要映射两个页面，因为你想从一个页面复制到另一个页面，你需要保持kmap_atomic调用严
+格嵌套，如::
+
+	vaddr1 = kmap_atomic(page1);
+	vaddr2 = kmap_atomic(page2);
+
+	memcpy(vaddr1, vaddr2, PAGE_SIZE);
+
+	kunmap_atomic(vaddr2);
+	kunmap_atomic(vaddr1);
+
+
+临时映射的成本
+==============
+
+创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。
+
+如果CONFIG_HIGHMEM没有被设置，那么内核会尝试用一点计算来创建映射，将页面结构地址转换成
+指向页面内容的指针，而不是去捣鼓映射。在这种情况下，解映射操作可能是一个空操作。
+
+如果CONFIG_MMU没有被设置，那么就不可能有临时映射和高内存。在这种情况下，也将使用计算方法。
+
+
+i386 PAE
+========
+
+在某些情况下，i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果:
+
+* Linux需要为系统中的每个页面建立一个页帧结构，而且页帧需要驻在永久映射中，这意味着：
+
+* 你最多可以有896M/sizeof(struct page)页帧；由于页结构体是32字节的，所以最终会有
+  112G的页；然而，内核需要在内存中存储更多的页帧......
+
+* PAE使你的页表变大--这使系统变慢，因为更多的数据需要在TLB填充等方面被访问。一个好处
+  是，PAE有更多的PTE位，可以提供像NX和PAT这样的高级功能。
+
+一般的建议是，你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作
+量有用，但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。
diff --git a/Documentation/translations/zh_CN/mm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst
new file mode 100644
index 000000000000..5024a8a15516
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/hmm.rst
@@ -0,0 +1,361 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/hmm.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==================
+异构内存管理 (HMM)
+==================
+
+提供基础设施和帮助程序以将非常规内存（设备内存，如板上 GPU 内存）集成到常规内核路径中，其
+基石是此类内存的专用struct page（请参阅本文档的第 5 至 7 节）。
+
+HMM 还为 SVM（共享虚拟内存）提供了可选的帮助程序，即允许设备透明地访问与 CPU 一致的程序
+地址，这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得
+必不可少，其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。
+
+本文档分为以下部分：在第一部分中，我揭示了与使用特定于设备的内存分配器相关的问题。在第二
+部分中，我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页
+表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后，
+最后一节介绍了一个新的迁移助手，它允许利用设备 DMA 引擎。
+
+.. contents:: :local:
+
+使用特定于设备的内存分配器的问题
+================================
+
+具有大量板载内存（几 GB）的设备（如 GPU）历来通过专用驱动程序特定 API 管理其内存。这会
+造成设备驱动程序分配和管理的内存与常规应用程序内存（私有匿名、共享内存或常规文件支持内存）
+之间的隔断。从这里开始，我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况：
+即，设备可以透明地使用任何应用程序内存区域。
+
+分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来
+看，程序中的所有内存对象并不平等，这使得依赖于广泛的库的大型程序变得复杂。
+
+具体来说，这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存（malloc、mmap
+私有、mmap 共享）和通过设备驱动程序 API 分配的内存之间复制对象（这仍然以 mmap 结束，
+但是是设备文件）。
+
+对于平面数据集（数组、网格、图像……），这并不难实现，但对于复杂数据集（列表、树……），
+很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错，
+而且由于数据集和地址的重复，程序更难调试。
+
+分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据，因此每个库可能
+不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响，并因为各种内存
+拷贝而浪费资源。
+
+复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出，并不是一个可行的选择。
+这将导致库入口点的组合爆炸。
+
+最后，随着高级语言结构（在 C++ 中，当然也在其他语言中）的进步，编译器现在有可能在没有程
+序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有
+其他模式，使用共享地址空间也更合理。
+
+
+I/O 总线、设备内存特性
+======================
+
+由于一些限制，I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本
+内存访问；甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下，它
+不是缓存一致的。
+
+如果我们只考虑 PCIE 总线，那么设备可以访问主内存（通常通过 IOMMU）并与 CPU 缓存一
+致。但是，它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟：CPU
+只能访问有限范围的设备内存，而不能对其执行原子操作。因此，从内核的角度来看，设备内存不
+能被视为与常规内存等同。
+
+另一个严重的因素是带宽有限（约 32GBytes/s，PCIE 4.0 和 16 通道）。这比最快的 GPU
+内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自
+己的内存时高一个数量级。
+
+一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制
+（OpenCAPI、CCIX）。它们主要允许 CPU 和设备之间的双向缓存一致性，并允许架构支持的所
+有原子操作。遗憾的是，并非所有平台都遵循这一趋势，并且一些主要架构没有针对这些问题的硬
+件解决方案。
+
+因此，为了使共享地址空间有意义，我们不仅必须允许设备访问任何内存，而且还必须允许任何内
+存在设备使用时迁移到设备内存（在迁移时阻止 CPU 访问）。
+
+
+共享地址空间和迁移
+==================
+
+HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间，因此对
+于进程地址空间中的任何有效主内存地址，相同的地址指向相同的物理内存。
+
+为了实现这一点，HMM 提供了一组帮助程序来填充设备页表，同时跟踪 CPU 页表更新。设备页表
+更新不像 CPU 页表更新那么容易。要更新设备页表，您必须分配一个缓冲区（或使用预先分配的
+缓冲区池）并在其中写入 GPU 特定命令以执行更新（取消映射、缓存失效和刷新等）。这不能通
+过所有设备的通用代码来完成。因此，为什么HMM提供了帮助器，在把硬件的具体细节留给设备驱
+动程序的同时，把一切可以考虑的因素都考虑进去了。
+
+HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存，它允许为设备内存的每个页面分配一个
+struct page。这些页面很特殊，因为 CPU 无法映射它们。然而，它们允许使用现有的迁移机
+制将主内存迁移到设备内存，从 CPU 的角度来看，一切看起来都像是换出到磁盘的页面。使用
+struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次，HMM 仅提供帮助程序，
+首先为设备内存热插拔新的 ZONE_DEVICE 内存，然后执行迁移。迁移内容和时间的策略决定留
+给设备驱动程序。
+
+请注意，任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如，当支持给定CPU
+地址 A 的页面从主内存页面迁移到设备页面时，对地址 A 的任何 CPU 访问都会触发缺页异常
+并启动向主内存的迁移。
+
+凭借这两个特性，HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步，而且还通
+过迁移设备正在使用的数据集部分来利用设备内存。
+
+
+地址空间镜像实现和API
+=====================
+
+地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中；HMM 有助于
+保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier
+开始::
+
+ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+				  struct mm_struct *mm, unsigned long start,
+				  unsigned long length,
+				  const struct mmu_interval_notifier_ops *ops);
+
+在 ops->invalidate() 回调期间，设备驱动程序必须对范围执行更新操作（将范围标记为只
+读，或完全取消映射等）。设备必须在驱动程序回调返回之前完成更新。
+
+当设备驱动程序想要填充一个虚拟地址范围时，它可以使用::
+
+  int hmm_range_fault(struct hmm_range *range);
+
+如果请求写访问，它将在丢失或只读条目上触发缺页异常（见下文）。缺页异常使用通用的 mm 缺
+页异常代码路径，就像 CPU 缺页异常一样。
+
+这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟
+范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。
+
+在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面，以保
+持事物正确同步。使用模式是::
+
+ int driver_populate_range(...)
+ {
+      struct hmm_range range;
+      ...
+
+      range.notifier = &interval_sub;
+      range.start = ...;
+      range.end = ...;
+      range.hmm_pfns = ...;
+
+      if (!mmget_not_zero(interval_sub->notifier.mm))
+          return -EFAULT;
+
+ again:
+      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+      mmap_read_lock(mm);
+      ret = hmm_range_fault(&range);
+      if (ret) {
+          mmap_read_unlock(mm);
+          if (ret == -EBUSY)
+                 goto again;
+          return ret;
+      }
+      mmap_read_unlock(mm);
+
+      take_lock(driver->update);
+      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
+          release_lock(driver->update);
+          goto again;
+      }
+
+      /* Use pfns array content to update device page table,
+       * under the update lock */
+
+      release_lock(driver->update);
+      return 0;
+ }
+
+driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用
+mmu_interval_read_retry() 之前保持，以避免与并发 CPU 页表更新发生任何竞争。
+
+利用 default_flags 和 pfn_flags_mask
+====================================
+
+hmm_range 结构有 2 个字段，default_flags 和 pfn_flags_mask，它们指定整个范围
+的故障或快照策略，而不必为 pfns 数组中的每个条目设置它们。
+
+例如，如果设备驱动程序需要至少具有读取权限的范围的页面，它会设置::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = 0;
+
+并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。
+
+现在假设驱动程序想要做同样的事情，除了它想要拥有写权限的范围内的一页。现在驱动程序设
+置::
+
+    range->default_flags = HMM_PFN_REQ_FAULT;
+    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
+
+有了这个，HMM 将在至少读取（即有效）的所有页面中异常，并且对于地址
+== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限，即，如果
+CPU pte 没有设置写权限，那么HMM将调用handle_mm_fault()。
+
+hmm_range_fault 完成后，标志位被设置为页表的当前状态，即 HMM_PFN_VALID | 如果页
+面可写，将设置 HMM_PFN_WRITE。
+
+
+从核心内核的角度表示和管理设备内存
+==================================
+
+尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存
+的信息，HMM 将自身挂接到 mm 代码的各个位置，以处理对设备内存支持的地址的任何访问。
+事实证明，这最终复制了 struct page 的大部分字段，并且还需要更新许多内核代码路径才
+能理解这种新的内存类型。
+
+大多数内核代码路径从不尝试访问页面后面的内存，而只关心struct page的内容。正因为如此，
+HMM 切换到直接使用 struct page 用于设备内存，这使得大多数内核代码路径不知道差异。
+我们只需要确保没有人试图从 CPU 端映射这些页面。
+
+移入和移出设备内存
+==================
+
+由于 CPU 无法直接访问设备内存，因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存
+储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和
+migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。
+
+在将页面迁移到设备私有内存之前，需要创建特殊的设备私有 ``struct page`` 。这些将用
+作特殊的“交换”页表条目，以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。
+
+这些可以通过以下方式分配和释放::
+
+    struct resource *res;
+    struct dev_pagemap pagemap;
+
+    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
+                                  "name of driver resource");
+    pagemap.type = MEMORY_DEVICE_PRIVATE;
+    pagemap.range.start = res->start;
+    pagemap.range.end = res->end;
+    pagemap.nr_range = 1;
+    pagemap.ops = &device_devmem_ops;
+    memremap_pages(&pagemap, numa_node_id());
+
+    memunmap_pages(&pagemap);
+    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
+
+还有devm_request_free_mem_region(), devm_memremap_pages(),
+devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``.
+
+整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration <page_migration>`) ，
+但这些步骤分为设备驱动程序特定代码和共享公共代码:
+
+1. ``mmap_read_lock()``
+
+   设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup()，
+   因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。
+
+2. ``migrate_vma_setup(struct migrate_vma *args)``
+
+   设备驱动初始化了 ``struct migrate_vma`` 的字段，并将该指针传递给
+   migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。
+   例如，设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存，设置
+   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页
+   面。如果后者被设置， ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备
+   私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前，只有匿名的私有VMA
+   范围可以被迁移到系统内存和设备私有内存。
+
+   migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()``
+   和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表，使
+   其他设备的MMU无效，以便在 ``args->src`` 数组中填写要迁移的PFN。
+   ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` ，
+   其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE， ``owner`` 字段设置为传递给
+   migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无
+   效化回调，只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。
+
+
+   在遍历页表时，一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效
+   的  “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清
+   除它，而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被
+   ``lock_page()``锁定，与LRU隔离（如果系统内存和设备私有页不在LRU上），从进
+   程中取消映射，并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup()
+   还清除了 ``args->dst`` 数组。
+
+3. 设备驱动程序分配目标页面并将源页面复制到目标页面。
+
+   驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已
+   设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选
+   择跳过页面迁移。
+
+   然后，驱动程序分配一个设备私有 struct page 或一个系统内存页，用 ``lock_page()``
+   锁定该页，并将 ``dst`` 数组条目填入::
+
+     dst[i] = migrate_pfn(page_to_pfn(dpage));
+
+   现在驱动程序知道这个页面正在被迁移，它可以使设备私有 MMU 映射无效并将设备私有
+   内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失
+   效，因此设备驱动程序只需使其自己的 MMU 映射失效。
+
+   驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的
+   ``struct page`` 面，并将源页面复制到目标设备上，如果指针为 ``NULL`` ，意
+   味着源页面没有被填充到系统内存中，则清除目标设备的私有内存。
+
+4. ``migrate_vma_pages()``
+
+   这一步是实际“提交”迁移的地方。
+
+   如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页，这时新分配的页会被插
+   入到CPU的页表中。如果一个CPU线程在同一页面上发生异常，这可能会失败。然而，页
+   表被锁定，只有一个新页会被插入。如果它失去了竞争，设备驱动将看到
+   ``MIGRATE_PFN_MIGRATE`` 位被清除。
+
+   如果源页被锁定、隔离等，源 ``struct page`` 信息现在被复制到目标
+   ``struct page`` ，最终完成CPU端的迁移。
+
+5. 设备驱动为仍在迁移的页面更新设备MMU页表，回滚未迁移的页面。
+
+   如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置，设备驱动可以
+   更新设备MMU，如果 ``MIGRATE_PFN_WRITE`` 位被设置，则设置写启用位。
+
+6. ``migrate_vma_finalize()``
+
+   这一步用新页的页表项替换特殊的迁移页表项，并释放对源和目的 ``struct page``
+   的引用。
+
+7. ``mmap_read_unlock()``
+
+   现在可以释放锁了。
+
+独占访问存储器
+==============
+
+一些设备具有诸如原子PTE位的功能，可以用来实现对系统内存的原子访问。为了支持对一
+个共享的虚拟内存页的原子操作，这样的设备需要对该页的访问是排他的，而不是来自CPU
+的任何用户空间访问。  ``make_device_exclusive_range()`` 函数可以用来使一
+个内存范围不能从用户空间访问。
+
+这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
+导致一个异常，该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已
+经被MMU通知器改变，之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序
+放弃页面锁和页面引用为止，这时页面上的任何CPU异常都可以按所述进行。
+
+内存 cgroup (memcg) 和 rss 统计
+===============================
+
+目前，设备内存被视为 rss 计数器中的任何常规页面（如果设备页面用于匿名，则为匿名，
+如果设备页面用于文件支持页面，则为文件，如果设备页面用于共享内存，则为 shmem）。
+这是为了保持现有应用程序的故意选择，这些应用程序可能在不知情的情况下开始使用设备
+内存，运行不受影响。
+
+一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序，
+因此不会释放太多系统内存。在决定以不同方式计算设备内存之前，我们希望收集更多关
+于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。
+
+对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算，常规
+页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规
+内存不会失败，因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对
+内存资源控制的影响有了更多的了解，我们可能会在后面重新考虑这个选择。
+
+请注意，设备内存永远不能由设备驱动程序或通过 GUP 固定，因此此类内存在进程退出时
+总是被释放的。或者在共享内存或文件支持内存的情况下，当删除最后一个引用时。
diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst
new file mode 100644
index 000000000000..752e5696cd47
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst
@@ -0,0 +1,436 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/hugetlbfs_reserv.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==============
+Hugetlbfs 预留
+==============
+
+概述
+====
+
+:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指
+示要使用巨页，这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常
+时没有巨页存在，任务就会被发送一个SIGBUS，并经常不高兴地死去。在加入巨页支
+持后不久，人们决定，在mmap()时检测巨页的短缺情况会更好。这个想法是，如果
+没有足够的巨页来覆盖映射，mmap()将失败。这首先是在mmap()时在代码中做一个
+简单的检查，以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一
+样，代码随着时间的推移而不断发展。然而，基本的想法是在mmap()时 “预留”
+巨页，以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核
+中是如何进行巨页预留处理的。
+
+
+读者
+====
+这个描述主要是针对正在修改hugetlbfs代码的内核开发者。
+
+
+数据结构
+========
+
+resv_huge_pages
+	这是一个全局的（per-hstate）预留的巨页的计数。预留的巨页只对预留它们的任
+	务可用。因此，一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。
+Reserve Map
+	预留映射由以下结构体描述::
+
+		struct resv_map {
+			struct kref refs;
+			spinlock_t lock;
+			struct list_head regions;
+			long adds_in_progress;
+			struct list_head region_cache;
+			long region_cache_count;
+		};
+
+	系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的
+	区域。一个区域被描述为::
+
+		struct file_region {
+			struct list_head link;
+			long from;
+			long to;
+		};
+
+	file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型，在
+	reserv_map 中的一个区域可能表示该范围存在预留，或预留不存在。
+Flags for MAP_PRIVATE Reservations
+	这些被存储在预留的映射指针的底部。
+
+	``#define HPAGE_RESV_OWNER    (1UL << 0)``
+		表示该任务是与该映射相关的预留的所有者。
+	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+		表示最初映射此范围（并创建储备）的任务由于COW失败而从该任务（子任务）中取消映
+		射了一个页面。
+Page Flags
+	PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在
+	“释放巨页” 一节中讨论。
+
+
+预留映射位置（私有或共享）
+==========================
+
+一个巨页映射或段要么是私有的，要么是共享的。如果是私有的，它通常只对一个地址空间
+（任务）可用。如果是共享的，它可以被映射到多个地址空间（任务）。对于这两种类型的映射，
+预留映射的位置和语义是明显不同的。位置的差异是：
+
+- 对于私有映射，预留映射挂在VMA结构体上。具体来说，就是vma->vm_private_data。这个保
+  留映射是在创建映射（mmap(MAP_PRIVATE)）时创建的。
+- 对于共享映射，预留映射挂在inode上。具体来说，就是inode->i_mapping->private_data。
+  由于共享映射总是由hugetlbfs文件系统中的文件支持，hugetlbfs代码确保每个节点包含一个预
+  留映射。因此，预留映射在创建节点时被分配。
+
+
+创建预留
+========
+当创建一个巨大的有页面支持的共享内存段（shmget(SHM_HUGETLB)）或通过mmap(MAP_HUGETLB)
+创建一个映射时，就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用::
+
+	int hugetlb_reserve_pages(struct inode *inode,
+				  long from, long to,
+				  struct vm_area_struct *vma,
+				  vm_flags_t vm_flags)
+
+hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE
+标志。如果指定了NORESERVE，那么这个函数立即返回，因为不需要预留。
+
+参数'from'和'to'是映射或基础文件的巨页索引。对于shmget()，'from'总是0，'to'对应于段/映射
+的长度。对于mmap()，offset参数可以用来指定进入底层文件的偏移量。在这种情况下，'from'和'to'
+参数已经被这个偏移量所调整。
+
+PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。
+
+- 对于共享映射，预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时，预留映射不被
+  修改。
+- 对于私有映射，预留映射中没有条目表示相应页面存在预留。随着预留被消耗，条目被添加到预留映射中。
+  因此，预留映射也可用于确定哪些预留已被消耗。
+
+对于私有映射，hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外，
+HPAGE_RESV_OWNER标志被设置，以表明该VMA拥有预留。
+
+预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射，这始终是一个值（to - from）。
+然而，对于共享映射来说，一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节，
+请参见 :ref:`预留映射的修改 <resv_map_modifications>` 一节。
+
+该映射可能与一个子池（subpool）相关联。如果是这样，将查询子池以确保有足够的空间用于映射。子池
+有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 <sub_pool_resv>`
+一节。
+
+在咨询了预留映射和子池之后，就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查
+并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而，在这
+些函数中，代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话，全局预留计数resv_huge_pages
+会被调整，如下所示::
+
+	if (resv_needed <= (resv_huge_pages - free_huge_pages))
+		resv_huge_pages += resv_needed;
+
+注意，在检查和调整这些计数器时，全局锁hugetlb_lock会被预留。
+
+如果有足够的空闲的巨页，并且全局计数resv_huge_pages被调整，那么与映射相关的预留映射被修改以
+反映预留。在共享映射的情况下，将存在一个file_region，包括'from'-'to'范围。对于私有映射，
+不对预留映射进行修改，因为没有条目表示存在预留。
+
+如果hugetlb_reserve_pages()成功，全局预留数和与映射相关的预留映射将根据需要被修改，以确保
+在'from'-'to'范围内存在预留。
+
+消耗预留/分配一个巨页
+===========================
+
+当与预留相关的巨页在相应的映射中被分配和实例化时，预留就被消耗了。该分配是在函数alloc_huge_page()
+中进行的::
+
+	struct page *alloc_huge_page(struct vm_area_struct *vma,
+				     unsigned long addr, int avoid_reserve)
+
+alloc_huge_page被传递给一个VMA指针和一个虚拟地址，因此它可以查阅预留映射以确定是否存在预留。
+此外，alloc_huge_page需要一个参数avoid_reserve，该参数表示即使看起来已经为指定的地址预留了
+预留，也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下，即现有页面的额
+外拷贝被分配。
+
+
+调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详
+细内容，请参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。从
+vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留，则为0，如果不存在预留，则为1。
+如果不存在预留，并且有一个与映射相关联的子池，则查询子池以确定它是否包含预留。如果子池包含预留，
+则可将其中一个用于该分配。然而，在任何情况下，avoid_reserve参数都会优先考虑为分配使用预留。在
+确定预留是否存在并可用于分配后，调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关
+的参数：
+
+- avoid_reserve，这是传递给alloc_huge_page()的同一个值/参数。
+- chg，尽管这个参数的类型是long，但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0，
+  则表明存在预留（关于可能的问题，请参见 “预留和内存策略” 一节）。如果值
+  为1，则表示不存在预留，如果可能的话，必须从全局空闲池中取出该页。
+
+与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面，当该页面从空闲列表中移除时，
+free_huge_pages的值被递减。如果有一个与该页相关的预留，将进行以下调整::
+
+	SetPagePrivate(page);	/* 表示分配这个页面消耗了一个预留，
+				 * 如果遇到错误，以至于必须释放这个页面，预留将被
+				 * 恢复。 */
+	resv_huge_pages--;	/* 减少全局预留计数 */
+
+注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
+的剩余巨页和超额分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整:
+SetPagePrivate(page) 和 resv_huge_pages--.
+
+在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
+面被释放时，这将被用于子池的计数。
+
+然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
+到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
+已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建一
+个新的条目。
+
+注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
+的剩余巨页和过度分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整。
+SetPagePrivate(page)和resv_huge_pages-。
+
+在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
+面被释放时，这将被用于子池的计数。
+
+然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
+到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
+已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建
+一个新的条目。
+
+在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用
+vma_commit_reservation()之间，预留映射有可能被改变。如果hugetlb_reserve_pages在共
+享映射中为同一页面被调用，这将是可能的。在这种情况下，预留计数和子池空闲页计数会有一个偏差。
+这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来
+识别。如果检测到这种竞争，子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息，请
+参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。
+
+
+实例化巨页
+==========
+
+在巨页分配之后，页面通常被添加到分配任务的页表中。在此之前，共享映射中的页面被添加到页面缓
+存中，私有映射中的页面被添加到匿名反向映射中。在这两种情况下，PagePrivate标志被清除。因此，
+当一个已经实例化的巨页被释放时，不会对全局预留计数（resv_huge_pages）进行调整。
+
+
+释放巨页
+========
+
+巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此，它只传
+递一个指向页面结构体的指针。当一个巨页被释放时，可能需要进行预留计算。如果该页与包含保
+留的子池相关联，或者该页在错误路径上被释放，必须恢复全局预留计数，就会出现这种情况。
+
+page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置，它表明全局预留计数
+应该被调整（关于如何设置这些标志的信息，请参见
+:ref: `消耗预留/分配一个巨页 <consume_resv>` ）。
+
+
+该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值（不等于
+传递的1的值），它表明预留与子池相关联，这个新释放的页面必须被用来保持子池预留的数量超过最小值。
+因此，在这种情况下，全局resv_huge_pages计数器被递增。
+
+如果页面中设置了PagePrivate标志，那么全局resv_huge_pages计数器将永远被递增。
+
+子池预留
+========
+
+有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一
+个hstate中的页面子集，它与一个已挂载的hugetlbfs文件系统相关
+
+当一个hugetlbfs文件系统被挂载时，可以指定min_size选项，它表示文件系统所需的最小的巨页数量。
+如果指定了这个选项，与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体
+hugepage_subpool的min_hpages字段中被跟踪。在挂载时，hugetlb_acct_memory(min_hpages)
+被调用以预留指定数量的巨页。如果它们不能被预留，挂载就会失败。
+
+当从子池中获取或释放页面时，会调用hugepage_subpool_get/put_pages()函数。
+hugepage_subpool_get/put_pages被传递给巨页数量，以此来调整子池的 “已用页面” 计数
+（get为下降，put为上升）。通常情况下，如果子池中没有足够的页面，它们会返回与传递的相同的值或
+一个错误。
+
+然而，如果预留与子池相关联，可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局
+池调整的数量。例如，假设一个子池包含3个预留的巨页，有人要求5个。与子池相关的3个预留页可以用来
+满足部分请求。但是，必须从全局池中获得2个页面。为了向调用者转达这一信息，将返回值2。然后，调用
+者要负责从全局池中获取另外两个页面。
+
+
+COW和预留
+==========
+
+由于共享映射都指向并使用相同的底层页面，COW最大的预留问题是私有映射。在这种情况下，两个任务可
+以指向同一个先前分配的页面。一个任务试图写到该页，所以必须分配一个新的页，以便每个任务都指向它
+自己的页。
+
+当该页最初被分配时，该页的预留被消耗了。当由于COW而试图分配一个新的页面时，有可能没有空闲的巨
+页，分配会失败。
+
+当最初创建私有映射时，通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。
+由于所有者创建了映射，所有者拥有与映射相关的所有预留。因此，当一个写异常发生并且没有可用的页面
+时，对预留的所有者和非所有者采取不同的行动。
+
+在发生异常的任务不是所有者的情况下，异常将失败，该任务通常会收到一个SIGBUS。
+
+如果所有者是发生异常的任务，我们希望它能够成功，因为它拥有原始的预留。为了达到这个目的，该页被
+从非所有者任务中解映射出来。这样一来，唯一的引用就是来自拥有者的任务。此外，HPAGE_RESV_UNMAPPED
+位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常，它可能
+会收到一个SIGBUS。但是，映射/预留的原始拥有者的行为将与预期一致。
+
+预留映射的修改
+==============
+
+以下低级函数用于对预留映射进行修改。通常情况下，这些函数不会被直接调用。而是调用一个预留映射辅
+助函数，该函数调用这些低级函数中的一个。这些低级函数在源代码（mm/hugetlb.c）中得到了相当好的
+记录。这些函数是::
+
+	long region_chg(struct resv_map *resv, long f, long t);
+	long region_add(struct resv_map *resv, long f, long t);
+	void region_abort(struct resv_map *resv, long f, long t);
+	long region_count(struct resv_map *resv, long f, long t);
+
+在预留映射上的操作通常涉及两个操作:
+
+1) region_chg()被调用来检查预留映射，并确定在指定的范围[f, t]内有多少页目前没有被代表。
+
+   调用代码执行全局检查和分配，以确定是否有足够的巨页使操作成功。
+
+2)
+  a) 如果操作能够成功，regi_add()将被调用，以实际修改先前传递给regi_chg()的相同范围
+     [f, t]的预留映射。
+  b) 如果操作不能成功，region_abort被调用，在相同的范围[f, t]内中止操作。
+
+注意，这是一个两步的过程， region_add()和 region_abort()在事先调用 region_chg()后保证
+成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作（特别是 region_add()）的
+成功。
+
+如上所述，region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加
+到映射中的范围内的页数。在大多数情况下， region_add() 的返回值与 region_chg() 的返回值相
+同。然而，在共享映射的情况下，有可能在调用 region_chg() 和 region_add() 之间对预留映射进
+行更改。在这种情况下，regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下，全局计数
+和子池计数很可能是不正确的，需要调整。检查这种情况并进行适当的调整是调用者的责任。
+
+函数region_del()被调用以从预留映射中移除区域。
+它通常在以下情况下被调用:
+
+- 当hugetlbfs文件系统中的一个文件被删除时，该节点将被释放，预留映射也被释放。在释放预留映射
+  之前，所有单独的file_region结构体必须被释放。在这种情况下，region_del的范围是[0, LONG_MAX]。
+- 当一个hugetlbfs文件正在被截断时。在这种情况下，所有在新文件大小之后分配的页面必须被释放。
+  此外，预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下，region_del
+  的范围是[new_end_of_file, LONG_MAX]。
+- 当在一个hugetlbfs文件中打洞时。在这种情况下，巨页被一次次从文件的中间移除。当这些页被移除
+  时，region_del()被调用以从预留映射中移除相应的条目。在这种情况下，region_del被传递的范
+  围是[page_idx, page_idx + 1]。
+
+在任何情况下，region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下，region_del()
+会失败。这只能发生在打洞的情况下，即它必须分割一个现有的file_region条目，而不能分配一个新的
+结构体。在这种错误情况下，region_del()将返回-ENOMEM。这里的问题是，预留映射将显示对该页有
+预留。然而，子池和全局预留计数将不反映该预留。为了处理这种情况，调用函数hugetlb_fix_reserve_counts()
+来调整计数器，使其与不能被删除的预留映射条目相对应。
+
+region_count()在解除私有巨页映射时被调用。在私有映射中，预留映射中没有条目表明存在一个预留。
+因此，通过计算预留映射中的条目数，我们知道有多少预留被消耗了，有多少预留是未完成的
+（Outstanding = (end - start) - region_count（resv, start, end））。由于映射正在消
+失，子池和全局预留计数被未完成的预留数量所减去。
+
+预留映射帮助函数
+================
+
+有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣，所以它们只是传入一个
+地址而不是一个范围。此外，它们还传入相关的VMA。从VMA中，可以确定映射的类型（私有或共享）和预留
+映射的位置（inode或VMA）。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而，
+它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义，并向调用者隐藏了这个细节::
+
+	long vma_needs_reservation(struct hstate *h,
+				   struct vm_area_struct *vma,
+				   unsigned long addr)
+
+该函数为指定的页面调用 region_chg()。如果不存在预留，则返回1。如果存在预留，则返回0::
+
+	long vma_commit_reservation(struct hstate *h,
+				    struct vm_area_struct *vma,
+				    unsigned long addr)
+
+这将调用 region_add()，用于指定的页面。与region_chg和region_add的情况一样，该函数应在
+先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加，它将
+返回1，如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出
+现意外的差异，说明在两次调用之间修改了预留映射::
+
+	void vma_end_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样，该函数应在
+先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作::
+
+	long vma_add_reservation(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long addr)
+
+这是一个特殊的包装函数，有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数
+中调用。该函数与vma_needs_reservation一起使用，试图将一个预留添加到预留映射中。它考虑到
+了私有和共享映射的不同预留映射语义。因此，region_add被调用用于共享映射（因为映射中的条目表
+示预留），而region_del被调用用于私有映射（因为映射中没有条目表示预留）。关于在错误路径上需
+要做什么的更多信息，请参见  “错误路径中的预留清理”  。
+
+
+错误路径中的预留清理
+====================
+
+正如在:ref:`预留映射帮助函数<resv_map_helpers>` 一节中提到的，预留的修改分两步进行。首
+先，在分配页面之前调用vma_needs_reservation。如果分配成功，则调用vma_commit_reservation。
+如果不是，则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整，
+一切都很好。
+
+此外，在一个巨页被实例化后，PagePrivate标志被清空，这样，当页面最终被释放时，计数是
+正确的。
+
+然而，有几种情况是，在一个巨页被分配后，但在它被实例化之前，就遇到了错误。在这种情况下，
+页面分配已经消耗了预留，并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放
+（在实例化和清除PagePrivate之前），那么free_huge_page将增加全局预留计数。然而，预留映射
+显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高，
+并阻止分配一个预先分配的页面。
+
+函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的
+是将预留映射恢复到页面分配前的状态。通过这种方式，预留映射的状态将与页面释放后的全局预留计
+数相对应。
+
+函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下，
+它将简单地清除该页的PagePrivate标志。这样一来，当页面被释放时，全局预留计数将不会被递增。
+然而，预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址，但它不会像最
+初设想的那样使用一个预留页。
+
+有一些代码（最明显的是userfaultfd）不能调用restore_reserve_on_error。在这种情况下，
+它简单地修改了PagePrivate，以便在释放巨页时不会泄露预留。
+
+
+预留和内存策略
+==============
+当git第一次被用来管理Linux代码时，每个节点的巨页列表就存在于hstate结构中。预留的概念是
+在一段时间后加入的。当预留被添加时，没有尝试将内存策略考虑在内。虽然cpusets与内存策略不
+完全相同，但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作
+用::
+
+
+	/*
+	 * 当cpuset被配置时，它打破了严格的hugetlb页面预留，因为计数是在一个全局变量上完
+	 * 成的。在有cpuset的情况下，这样的预留完全是垃圾，因为预留没有根据当前cpuset的
+	 * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时，应用程序仍然有可能
+	 * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的（或者说太难看了），因
+	 * 为cpuset太不稳定了，任务或内存节点可以在cpuset之间动态移动。与cpuset共享
+	 * hugetlb映射的语义变化是不可取的。然而，为了预留一些语义，我们退回到检查当前空闲
+	 * 页的可用性，作为一种最好的尝试，希望能将cpuset改变语义的影响降到最低。
+	 */
+
+添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败（OOM）。然而，如果一个应用
+程序使用cpusets或内存策略，就不能保证在所需的节点上有巨页可用。即使有足够数量的全局
+预留，也是如此。
+
+Hugetlbfs回归测试
+=================
+
+最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码，请使用
+libhugetlbfs测试套件来检查回归情况。此外，如果你添加了任何新的hugetlb功能，请在
+libhugetlbfs中添加适当的测试。
+
+--
+Mike Kravetz，2017年4月7日
diff --git a/Documentation/translations/zh_CN/mm/hwpoison.rst b/Documentation/translations/zh_CN/mm/hwpoison.rst
new file mode 100644
index 000000000000..310862edc937
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/hwpoison.rst
@@ -0,0 +1,166 @@
+
+:Original: Documentation/mm/hwpoison.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+hwpoison
+========
+
+什么是hwpoison?
+===============
+
+
+即将推出的英特尔CPU支持从一些内存错误中恢复（ ``MCA恢复`` ）。这需要操作系统宣布
+一个页面"poisoned"，杀死与之相关的进程，并避免在未来使用它。
+
+这个补丁包在虚拟机中实现了必要的(编程)框架。
+
+引用概述中的评论::
+
+	高级机器的检查与处理。处理方法是损坏的页面被硬件报告，通常是由于2位ECC内
+	存或高速缓存故障。
+
+	这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时，当前运行的进程
+	可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理，就可
+	以安全地忽略它. 而不是用另外一个机器检查去处理它。
+
+	处理不同状态的页面缓存页。这里棘手的部分是，相对于其他虚拟内存用户， 我们可以异
+	步访问任何页面。因为内存故障可能随时随地发生，可能违反了他们的一些假设。这就是
+	为什么这段代码必须非常小心。一般来说，它试图使用正常的锁规则，如获得标准锁，即使
+	这意味着错误处理可能需要很长的时间。
+
+	这里的一些操作有点低效，并且具有非线性的算法复杂性，因为数据结构没有针对这种情
+	况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的，所
+	以我们希望我们可以摆脱这种情况。
+
+该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的
+各种检查组成，用来处理poison的页面。
+
+现在主要目标是KVM客户机，但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm
+版本。
+
+对于KVM的使用，需要一个新的信号类型，这样KVM就可以用适当的地址将机器检查注入到客户
+机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是，所有的应用程序都不要这
+样做，但一些非常专业的应用程序可能会这样做。
+
+故障恢复模式
+============
+
+有两种（实际上是三种）模式的内存故障恢复可以在。
+
+vm.memory_failure_recovery sysctl 置零:
+	所有的内存故障都会导致panic。请不要尝试恢复。
+
+早期处理
+	(可以在全局和每个进程中控制) 一旦检测到错误，立即向应用程序发送SIGBUS这允许
+	应用程序以温和的方式处理内存错误（例如，放弃受影响的对象） 这是KVM qemu使用的
+	模式。
+
+推迟处理
+	当应用程序运行到损坏的页面时，发送SIGBUS。这对不知道内存错误的应用程序来说是
+	最好的，默认情况下注意一些页面总是被当作late kill处理。
+
+用户控制
+========
+
+vm.memory_failure_recovery
+	参阅 sysctl.txt
+
+vm.memory_failure_early_kill
+	全局启用early kill
+
+PR_MCE_KILL
+	设置early/late kill mode/revert 到系统默认值。
+
+	arg1: PR_MCE_KILL_CLEAR:
+		恢复到系统默认值
+	arg1: PR_MCE_KILL_SET:
+		arg2定义了线程特定模式
+
+		PR_MCE_KILL_EARLY:
+			Early kill
+		PR_MCE_KILL_LATE:
+			Late kill
+		PR_MCE_KILL_DEFAULT
+			使用系统全局默认值
+
+	注意，如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO)，你应该在
+	指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则，SIGBUS将被发送到主线程。
+
+PR_MCE_KILL_GET
+	返回当前模式
+
+测试
+====
+
+* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面
+
+* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块
+
+  corrupt-pfn
+	在PFN处注入hwpoison故障，并echoed到这个文件。这做了一些早期过滤，以避
+	免在测试套件中损坏非预期页面。
+  unpoison-pfn
+	在PFN的Software-unpoison页面对应到这个文件。这样，一个页面可以再次被
+	复用。这只对Linux注入的故障起作用，对真正的内存故障不起作用。
+
+  注意这些注入接口并不稳定，可能会在不同的内核版本中发生变化
+
+  corrupt-filter-dev-major, corrupt-filter-dev-minor
+	只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通
+	配符值。这应该只用于人工注入的测试。
+
+  corrupt-filter-memcg
+	限制注入到memgroup拥有的页面。由memcg的inode号指定。
+
+	Example::
+
+		mkdir /sys/fs/cgroup/mem/hwpoison
+
+	        usemem -m 100 -s 1000 &
+		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+		page-types -p `pidof init`   --hwpoison  # shall do nothing
+		page-types -p `pidof usemem` --hwpoison  # poison its pages
+
+  corrupt-filter-flags-mask, corrupt-filter-flags-value
+	当指定时，只有在((page_flags & mask) == value)的情况下才会poison页面。
+	这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相
+	同。这些标志位在include/linux/kernel-page-flags.h中定义，并在
+	Documentation/admin-guide/mm/pagemap.rst中记录。
+
+* 架构特定的MCE注入器
+
+  x86 有 mce-inject, mce-test
+
+  在mce-test中的一些便携式hwpoison测试程序，见下文。
+
+引用
+====
+
+http://halobates.de/mce-lc09-2.pdf
+	09年LinuxCon的概述演讲
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+	测试套件（在tsrc中的hwpoison特定可移植测试）。
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+	x86特定的注入器
+
+
+限制
+====
+- 不是所有的页面类型都被支持，而且永远不会。大多数内核内部对象不能被恢
+  复，目前只有LRU页。
+
+---
+Andi Kleen, 2009年10月
diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst
new file mode 100644
index 000000000000..4c8c6b7b72a3
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/index.rst
@@ -0,0 +1,54 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+=================
+Linux内存管理文档
+=================
+
+这是一个关于Linux内存管理（mm）子系统内部的文档集，其中有不同层次的细节，包括注释
+和邮件列表的回复，用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建
+议，请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。
+对于控制和调整指南，请参阅(Documentation/admin-guide/mm/index)。
+TODO:待引用文档集被翻译完毕后请及时修改此处）
+
+.. toctree::
+   :maxdepth: 1
+
+   active_mm
+   balance
+   damon/index
+   free_page_reporting
+   highmem
+   ksm
+   frontswap
+   hmm
+   hwpoison
+   hugetlbfs_reserv
+   memory-model
+   mmu_notifier
+   numa
+   overcommit-accounting
+   page_frags
+   page_owner
+   page_table_check
+   remap_file_pages
+   split_page_table_lock
+   z3fold
+   zsmalloc
+
+TODOLIST:
+* arch_pgtable_helpers
+* free_page_reporting
+* hugetlbfs_reserv
+* page_migration
+* slub
+* transhuge
+* unevictable-lru
+* vmalloced-kernel-stacks
diff --git a/Documentation/translations/zh_CN/mm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst
new file mode 100644
index 000000000000..d1f82e857ad7
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/ksm.rst
@@ -0,0 +1,70 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/mm/ksm.rst
+
+:翻译:
+
+   徐鑫 xu xin <xu.xin16@zte.com.cn>
+
+============
+内核同页合并
+============
+
+KSM 是一种节省内存的数据去重功能，由CONFIG_KSM=y启用，并在2.6.32版本时被添加
+到Linux内核。详见 ``mm/ksm.c`` 的实现，以及http://lwn.net/Articles/306704和
+https://lwn.net/Articles/330589
+
+KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst
+文档中有描述。
+
+设计
+====
+
+概述
+----
+
+概述内容请见mm/ksm.c文档中的“DOC: Overview”
+
+逆映射
+------
+KSM维护着稳定树中的KSM页的逆映射信息。
+
+当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时，则代表了
+KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时，这个KSM页
+的 ``page->mapping`` 指向了该稳定树节点。
+
+如果共享数超过了阈值，KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多
+个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息，其中 ``page->mapping``
+指向该"副本"。
+
+每个链以及链接到该链中的所有"副本"强制不变的是，它们代表了相同的写保护内存
+内容，尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。
+
+这样一来，相比与无限的逆映射链表，稳定树的查找计算复杂性不受影响。但在稳定树
+本身中不能有重复的KSM页面内容仍然是强制要求。
+
+由 ``max_page_sharing`` 强制决定的数据去重限制是必要的，以此来避免虚拟内存
+rmap链表变得过大。rmap的遍历具有O(N)的复杂度，其中N是共享页面的rmap_项（即
+虚拟映射）的数量，而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。
+因此，这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进
+程在稳定节点"链"上的遍历也是O(N)，但这个N是稳定树"副本"的数量，而不是rmap项
+的数量，因此它对ksmd性能没有显著影响。实际上，最佳稳定树"副本"的候选节点将
+保留在"副本"列表的开头。
+
+``max_page_sharing`` 的值设置得高了会促使更快的内存合并（因为将有更少的稳定
+树副本排队进入稳定节点chain->hlist）和更高的数据去重系数，但代价是在交换、压
+缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。
+
+``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控
+的影响，高比值可能意味着稳定节点dup中存在碎片，这可以通过在ksmd中引入碎片算
+法来解决，该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup，以便释放
+那些仅包含极少rmap项的稳定节点"dup"，但这可能会增加ksmd进程的CPU使用率，并可
+能会减慢应用程序在KSM页面上的只读计算。
+
+KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本"，以便删减过时了的稳定节点。
+这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。
+
+参考
+====
+内核代码请见mm/ksm.c。
+涉及的函数(mm_slot  ksm_scan  stable_node  rmap_item)。
diff --git a/Documentation/translations/zh_CN/mm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst
new file mode 100644
index 000000000000..77ec149a970c
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/memory-model.rst
@@ -0,0 +1,135 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/memory-model.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+============
+物理内存模型
+============
+
+系统中的物理内存可以用不同的方式进行寻址。最简单的情况是，物理内存从地址0开
+始，跨越一个连续的范围，直到最大的地址。然而，这个范围可能包含CPU无法访问的
+小孔隙。那么，在完全不同的地址可能有几个连续的范围。而且，别忘了NUMA，即不
+同的内存库连接到不同的CPU。
+
+Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每
+个架构都定义了它所支持的内存模型，默认的内存模型是什么，以及是否有可能手动
+覆盖该默认值。
+
+所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页
+帧的状态。
+
+无论选择哪种内存模型，物理页框号（PFN）和相应的 `struct page` 之间都存
+在一对一的映射关系。
+
+每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn`
+帮助函数，允许从PFN到 `struct page` 的转换，反之亦然。
+
+FLATMEM
+=======
+
+最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的
+物理内存。
+
+在FLATMEM内存模型中，有一个全局的 `mem_map` 数组来映射整个物理内存。对
+于大多数架构，孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page`
+对象从未被完全初始化。
+
+为了分配 `mem_map` 数组，架构特定的设置代码应该调用free_area_init()函数。
+然而，在调用memblock_free_all()函数之前，映射数组是不能使用的，该函数
+将所有的内存交给页分配器。
+
+一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下，特
+定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。
+
+使用FLATMEM，PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET`
+是 `mem_map` 数组的一个索引。
+
+`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。
+
+SPARSEMEM
+=========
+
+SPARSEMEM是Linux中最通用的内存模型，它是唯一支持若干高级功能的内存模型，
+如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟
+初始化。
+
+SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构
+体表示，它包含 `section_mem_map` ，从逻辑上讲，它是一个指向 `struct page`
+阵列的指针。然而，它被存储在一些其他的magic中，以帮助分区管理。区段的大小
+和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量
+来指定的，这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS`
+是一个架构所支持的物理地址的实际宽度，而 `SECTION_SIZE_BITS` 是一个任
+意的值。
+
+最大的段数表示为 `NR_MEM_SECTIONS` ，定义为
+
+.. math::
+
+   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
+
+`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的
+大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数:
+
+* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时， `mem_sections` 数组是静态的，有
+  `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。
+* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时， `mem_sections` 数组被动态分配。
+  每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象，行数的计算是为了适应所有的
+  内存区。
+
+架构设置代码应该调用sparse_init()来初始化内存区和内存映射。
+
+通过SPARSEMEM，有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和
+ "sparse vmemmap"。选择是在构建时进行的，它由 `CONFIG_SPARSEMEM_VMEMMAP` 的
+ 值决定。
+
+Classic sparse在page->flags中编码了一个页面的段号，并使用PFN的高位来访问映射该页
+框的段。在一个区段内，PFN是指向页数组的索引。
+
+Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操
+作。有一个全局的 `struct page *vmemmap` 指针，指向一个虚拟连续的 `struct page`
+对象阵列。PFN是该数组的一个索引，`struct page` 从 `vmemmap` 的偏移量是该页的PFN。
+
+为了使用vmemmap，一个架构必须保留一个虚拟地址的范围，以映射包含内存映射的物理页，并
+确保 `vmemmap`指向该范围。此外，架构应该实现 :c:func:`vmemmap_populate` 方法，
+它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求，
+它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。
+
+虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分
+配的存储中。这种存储用vmem_altmap结构表示，最终通过一长串的函数调用传递给
+vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和
+:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。
+
+ZONE_DEVICE
+===========
+`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上，为设备驱动识别的物理地址范
+围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下
+事实有关：这些地址范围的页面对象从未被在线标记过，而且必须对设备进行引用，而不仅仅
+是页面，以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ，通过 :c:func:`devm_memremap_pages` ，
+为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`，
+:c:func:`page_to_pfn`, ，和 :c:func:`get_user_pages` 服务。由于页面引
+用计数永远不会低于1，所以页面永远不会被追踪为空闲内存，页面的 `struct list_head lru`
+空间被重新利用，用于向映射该内存的主机设备/驱动程序进行反向引用。
+
+虽然 `SPARSEMEM` 将内存作为一个区段的集合，可以选择收集并合成内存块，但
+`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE`
+内存从未被在线标记，因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界
+上。这个实现依赖于这种缺乏用户接口的约束，允许子段大小的内存范围被指定给
+:c:func:`arch_add_memory` ，即内存热插拔的上半部分。子段支持允许2MB作为
+:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。
+
+`ZONE_DEVICE` 的用户是:
+
+* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。
+
+* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` ，
+  以允许设备驱动程序协调与设备内存相关的内存管理事件，通常是GPU内存。参见Documentation/mm/hmm.rst。
+
+* p2pdma: 创建 `struct page` 对象，允许PCI/E拓扑结构中的peer设备协调它们之间的
+  直接DMA操作，即绕过主机内存。
diff --git a/Documentation/translations/zh_CN/mm/mmu_notifier.rst b/Documentation/translations/zh_CN/mm/mmu_notifier.rst
new file mode 100644
index 000000000000..ce3664d1a410
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/mmu_notifier.rst
@@ -0,0 +1,97 @@
+:Original: Documentation/mm/mmu_notifier.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+
+什么时候需要页表锁内通知？
+==========================
+
+当清除一个pte/pmd时，我们可以选择通过在页表锁下（通知版的\*_clear_flush调用
+mmu_notifier_invalidate_range）通知事件。但这种通知并不是在所有情况下都需要的。
+
+对于二级TLB（非CPU TLB），如IOMMU TLB或设备TLB（当设备使用类似ATS/PASID的东西让
+IOMMU走CPU页表来访问进程的虚拟地址空间）。只有两种情况需要在清除pte/pmd时在持有页
+表锁的同时通知这些二级TLB：
+
+  A) 在mmu_notifier_invalidate_range_end()之前，支持页的地址被释放。
+  B) 一个页表项被更新以指向一个新的页面（COW，零页上的写异常，__replace_page()，...）。
+
+情况A很明显，你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。
+
+情况B更加微妙。为了正确起见，它需要按照以下序列发生:
+
+  - 上页表锁
+  - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify())
+  - 设置页表项以指向新页
+
+如果在设置新的pte/pmd值之前，清除页表项之后没有进行通知，那么你就会破坏设备的C11或
+C++11等内存模型。
+
+考虑以下情况（设备使用类似于ATS/PASID的功能）。
+
+两个地址addrA和addrB，这样|addrA - addrB| >= PAGE_SIZE，我们假设它们是COW的
+写保护（B的其他情况也适用）。
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0  {尝试写到addrA}
+ CPU-thread-1  {尝试写到addrB}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {读取addrA并填充设备TLB}
+ DEV-thread-2  {读取addrB并填充设备TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0  {COW_step1: {更新页表以指向addrA的新页}}
+ CPU-thread-1  {COW_step1: {更新页表以指向addrB的新页}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {写入addrA，这是对新页面的写入}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {preempted}
+ CPU-thread-2  {}
+ CPU-thread-3  {写入addrB，这是一个写入新页的过程}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {}
+ DEV-thread-2  {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0  {preempted}
+ CPU-thread-1  {}
+ CPU-thread-2  {}
+ CPU-thread-3  {}
+ DEV-thread-0  {从旧页中读取addrA}
+ DEV-thread-2  {从新页面读取addrB}
+
+所以在这里，因为在N+2的时候，清空页表项没有和通知一起作废二级TLB，设备在看到addrA的新值之前
+就看到了addrB的新值。这就破坏了设备的总内存序。
+
+当改变一个pte的写保护或指向一个新的具有相同内容的写保护页（KSM）时，将mmu_notifier_invalidate_range
+调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程
+在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占，也是如此。
diff --git a/Documentation/translations/zh_CN/mm/numa.rst b/Documentation/translations/zh_CN/mm/numa.rst
new file mode 100644
index 000000000000..b15cfeeb6dfb
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/numa.rst
@@ -0,0 +1,101 @@
+:Original: Documentation/mm/numa.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+始于1999年11月，作者： <kanoj@sgi.com>
+
+==========================
+何为非统一内存访问(NUMA)？
+==========================
+
+这个问题可以从几个视角来回答：硬件观点和Linux软件视角。
+
+从硬件角度看，NUMA系统是一个由多个组件或装配组成的计算机平台，每个组件可能包含0个或更多的CPU、
+本地内存和/或IO总线。为了简洁起见，并将这些物理组件/装配的硬件视角与软件抽象区分开来，我们在
+本文中称这些组件/装配为“单元”。
+
+每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能
+不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如，交叉开关或点对点
+链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来，以创建NUMA平台，其中的单元与其
+他单元有多个距离。
+
+对于Linux，感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中，
+所有的内存都是可见的，并且可以从连接到任何单元的任何CPU中访问，缓存一致性是由处理器缓存和/或
+系统互连在硬件中处理。
+
+内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元
+有多远。例如，连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和
+更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的（其他）单元。
+
+平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反，这种架构是提供可扩展
+内存带宽的一种手段。然而，为了实现可扩展的内存带宽，系统和应用软件必须安排大部分的内存引用
+[cache misses]到“本地”内存——同一单元的内存，如果有的话——或者到最近的有内存的单元。
+
+这就自然而然有了Linux软件对NUMA系统的视角:
+
+Linux将系统的硬件资源划分为多个软件抽象，称为“节点”。Linux将节点映射到硬件平台的物理单元
+上，对一些架构的细节进行了抽象。与物理单元一样，软件节点可能包含0或更多的CPU、内存和/或IO
+总线。同样，对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快
+的访问时间和更高的有效带宽。
+
+对于一些架构，如x86，Linux将“隐藏”任何代表没有内存连接的物理单元的节点，并将连接到该单元
+的任何CPU重新分配到代表有内存的单元的节点上。因此，在这些架构上，我们不能假设Linux将所有
+的CPU与一个给定的节点相关联，会看到相同的本地内存访问时间和带宽。
+
+此外，对于某些架构，同样以x86为例，Linux支持对额外节点的仿真。对于NUMA仿真，Linux会将现
+有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部
+分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的，当与cpusets一起使用时，
+可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+对于每个有内存的节点，Linux构建了一个独立的内存管理子系统，有自己的空闲页列表、使用中页列表、
+使用统计和锁来调解访问。此外，Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE
+中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求
+时要访问的区/节点。当一个区没有可用的内存来满足请求时，这种情况被称为“overflow 溢出”或
+“fallback 回退”。
+
+由于一些节点包含多个包含不同类型内存的区，Linux必须决定是否对区列表进行排序，使分配回退到不同
+节点上的相同区类型，或同一节点上的不同区类型。这是一个重要的考虑因素，因为有些区，如DMA或DMA32，
+代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距
+离排序的远程节点之前，它会尝试回退到同一节点的其他分区。
+
+默认情况下，Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说，Linux将试
+图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能
+满足请求，内核将检查所选分区列表中其他节点的区域，寻找列表中第一个能满足请求的区域。
+
+本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配
+一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构
+中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥
+远的调度域中。然而，调度器并没有直接考虑到任务的NUMA足迹。因此，在充分不平衡的情况下，任务可
+以在节点之间迁移，远离其初始节点和内核数据结构。
+
+系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口，如taskset(1)和numactl(1)，以及程
+序接口，如sched_setaffinity(2)，来限制任务的迁移，以改善NUMA定位。此外，人们可以使用
+Linux NUMA内存策略修改内核的默认本地分配行为。 [见
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
+
+系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点
+的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+在不隐藏无内存节点的架构上，Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无
+内存的节点，“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反，它
+将是内核在建立分区列表时选择的离它最近的有内存的节点。所以，默认情况下，本地分配将由内核提供
+最近的可用内存来完成。这是同一机制的结果，该机制允许这种分配在一个包含内存的节点溢出时回退到
+其他附近的节点。
+
+一些内核分配不希望或不能容忍这种分配回退行为。相反，他们想确保他们从指定的节点获得内存，或者
+得到通知说该节点没有空闲内存。例如，当一个子系统分配每个CPU的内存资源时，通常是这种情况。
+
+一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的
+节点ID，然后只从返回的节点ID请求内存。当这样的分配失败时，请求的子系统可以恢复到它自己的回退
+路径。板块内核内存分配器就是这样的一个例子。或者，子系统可以选择在分配失败时禁用或不启用自己。
+内核分析子系统就是这样的一个例子。
+
+如果架构支持——不隐藏无内存节点，那么连接到无内存节点的CPU将总是产生回退路径的开销，或者一些
+子系统如果试图完全从无内存的节点分配内存，将无法初始化。为了透明地支持这种架构，内核子系统可
+以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样，这是同
+一个节点，默认的本地页分配将从这个节点开始尝试。
diff --git a/Documentation/translations/zh_CN/mm/overcommit-accounting.rst b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst
new file mode 100644
index 000000000000..d8452d8b7fbb
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst
@@ -0,0 +1,86 @@
+:Original: Documentation/mm/overcommit-accounting.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+
+==============
+超量使用审计
+==============
+
+Linux内核支持下列超量使用处理模式
+
+0
+	启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。
+	它确保严重的疯狂分配失败，同时允许超量使用以减少swap的使用。在这种模式下，
+	允许root分配稍多的内存。这是默认的。
+1
+	总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码，只是依赖
+	几乎完全由零页组成的虚拟内存
+
+2
+	不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量
+	（默认为50%）。根据你使用的数量，在大多数情况下，这意味着一个进程在访问页面时
+	不会被杀死，但会在内存分配上收到相应的错误。
+
+	对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说
+	是很有用的。
+
+超量使用策略是通过sysctl  `vm.overcommit_memory` 设置的。
+
+可以通过 `vm.overcommit_ratio` （百分比）或 `vm.overcommit_kbytes` （绝对值）
+来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。
+
+在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前
+的超量使用和提交量。
+
+陷阱
+====
+
+C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证，并在接近边缘的地方运行，
+你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说，这并
+不重要，但如果你真的非常关心的话，这就是一个值得关注的案例。
+
+
+在模式2中，MAP_NORESERVE标志被忽略。
+
+
+它是如何工作的
+==============
+
+超量使用是基于以下规则
+
+对于文件映射
+	| SHARED or READ-only	-	0 cost (该文件是映射而不是交换)
+	| PRIVATE WRITABLE	-	每个实例的映射大小
+
+对于匿名或者 ``/dev/zero`` 映射
+	| SHARED			-	映射的大小
+	| PRIVATE READ-only	-	0 cost (但作用不大)
+	| PRIVATE WRITABLE	-	每个实例的映射大小
+
+额外的计数
+	| 通过mmap制作可写副本的页面
+	| 从同一池中提取的shmfs内存
+
+状态
+====
+
+*	我们核算mmap内存映射
+*	我们核算mprotect在提交中的变化
+*	我们核算mremap的大小变化
+*	我们的审计 brk
+*	审计munmap
+*	我们在/proc中报告commit 状态
+*	核对并检查分叉的情况
+*	审查堆栈处理/执行中的构建
+*	叙述SHMfs的情况
+*	实现实际限制的执行
+
+待续
+====
+*	ptrace 页计数（这很难）。
diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst
new file mode 100644
index 000000000000..320952ca93af
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_frags.rst
@@ -0,0 +1,38 @@
+:Original: Documentation/mm/page_frag.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+页面片段
+========
+
+一个页面片段是一个任意长度的任意偏移的内存区域，它位于一个0或更高阶的复合页面中。
+该页中的多个碎片在该页的引用计数器中被单独计算。
+
+page_frag函数，page_frag_alloc和page_frag_free，为页面片段提供了一个简单
+的分配框架。这被网络堆栈和网络设备驱动使用，以提供一个内存的支持区域，作为
+sk_buff->head使用，或者用于skb_shared_info的 “frags” 部分。
+
+为了使用页面片段API，需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点，
+并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用，
+这在分配时开销可能会很大。然而，由于这种缓存的性质，要求任何对缓存的调用都要受到每
+个CPU的限制，或者每个CPU的限制，并在执行碎片分配时强制禁止中断。
+
+网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用
+netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache
+被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是
+它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用，因为这些函数
+将禁用中断，而 ”napi“ 前缀的函数只可以在softirq上下文中使用。
+
+许多网络设备驱动程序使用类似的方法来分配页面片段，但页面片段是在环或描述符级别上
+缓存的。为了实现这些情况，有必要提供一种拆解页面缓存的通用方法。出于这个原因，
+__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。
+这样做的好处是，它允许清理被添加到一个页面的多个引用，以避免每次分配都调用
+get_page。
+
+Alexander Duyck，2016年11月29日。
diff --git a/Documentation/translations/zh_CN/mm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst
new file mode 100644
index 000000000000..03d9e613094a
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_owner.rst
@@ -0,0 +1,116 @@
+:Original: Documentation/mm/page_owner.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+================================
+page owner: 跟踪谁分配的每个页面
+================================
+
+概述
+====
+
+page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。
+当分配发生时，有关分配的信息，如调用堆栈和页面的顺序被存储到每个页面的特定存储中。
+当我们需要了解所有页面的状态时，我们可以获得并分析这些信息。
+
+尽管我们已经有了追踪页面分配/释放的tracepoint，但用它来分析谁分配的每个页面是
+相当复杂的。我们需要扩大跟踪缓冲区，以防止在用户空间程序启动前出现重叠。而且，启
+动的程序会不断地将跟踪缓冲区转出，供以后分析，这将会改变系统的行为，会产生更多的
+可能性，而不是仅仅保留在内存中，所以不利于调试。
+
+页面所有者也可以用于各种目的。例如，可以通过每个页面的gfp标志信息获得精确的碎片
+统计。如果启用了page owner，它就已经实现并激活了。我们非常欢迎其他用途。
+
+page owner在默认情况下是禁用的。所以，如果你想使用它，你需要在你的启动cmdline
+中加入"page_owner=on"。如果内核是用page owner构建的，并且由于没有启用启动
+选项而在运行时禁用page owner，那么运行时的开销是很小的。如果在运行时禁用，它不
+需要内存来存储所有者信息，所以没有运行时内存开销。而且，页面所有者在页面分配器的
+热路径中只插入了两个不可能的分支，如果不启用，那么分配就会像没有页面所有者的内核
+一样进行。这两个不可能的分支应该不会影响到分配的性能，特别是在静态键跳转标签修补
+功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。
+
+- 没有page owner::
+
+   text    data     bss     dec     hex filename
+   48392   2333     644   51369    c8a9 mm/page_alloc.o
+
+- 有page owner::
+
+   text    data     bss     dec     hex filename
+   48800   2445     644   51889    cab1 mm/page_alloc.o
+   6662     108      29    6799    1a8f mm/page_owner.o
+   1025       8       8    1041     411 mm/page_ext.o
+
+虽然总共增加了8KB的代码，但page_alloc.o增加了520字节，其中不到一半是在hotpath
+中。构建带有page owner的内核，并在需要时打开它，将是调试内核内存问题的最佳选择。
+
+有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这
+个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些，所以，在初始化
+之前，许多页面可以被分配，但它们没有所有者信息。为了解决这个问题，这些早期分配的
+页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息，但至
+少，我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上，有13343
+个早期分配的页面被捕捉和标记，尽管它们大部分是由结构页扩展功能分配的。总之，在这
+之后，没有任何页面处于未追踪状态。
+
+使用方法
+========
+
+1) 构建用户空间的帮助::
+
+	cd tools/vm
+	make page_owner_sort
+
+2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline.
+
+3) 做你想调试的工作。
+
+4) 分析来自页面所有者的信息::
+
+	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+   ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值)::
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+	Page allocated via order XXX, ...
+	PFN XXX ...
+	// Detailed stack
+
+   ``page_owner_sort`` 工具忽略了 ``PFN`` 行，将剩余的行放在buf中，使用regexp提
+   取页序值，计算buf的次数和页数，最后根据参数进行排序。
+
+   在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出::
+
+	XXX times, XXX pages:
+	Page allocated via order XXX, ...
+	// Detailed stack
+
+   默认情况下， ``page_owner_sort`` 是根据buf的时间来排序的。如果你想
+   按buf的页数排序，请使用-m参数。详细的参数是:
+
+   基本函数:
+
+	Sort:
+		-a		按内存分配时间排序
+		-m		按总内存排序
+		-p		按pid排序。
+		-P		按tgid排序。
+		-r		按内存释放时间排序。
+		-s		按堆栈跟踪排序。
+		-t		按时间排序（默认）。
+
+   其它函数:
+
+	Cull:
+		-c		通过比较堆栈跟踪而不是总块来进行剔除。
+
+	Filter:
+		-f		过滤掉内存已被释放的块的信息。
diff --git a/Documentation/translations/zh_CN/mm/page_table_check.rst b/Documentation/translations/zh_CN/mm/page_table_check.rst
new file mode 100644
index 000000000000..e8077310a76c
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/mm/page_table_check.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+页表检查
+========
+
+概述
+====
+
+页表检查允许通过确保防止某些类型的内存损坏来强化内核。
+
+当新的页面可以从用户空间访问时，页表检查通过将它们的页表项（PTEs PMD等）添加到页表中来执行额外
+的验证。
+
+在检测到损坏的情况下，内核会被崩溃。页表检查有一个小的性能和内存开销。因此，它在默认情况下是禁用
+的，但是在额外的加固超过性能成本的系统上，可以选择启用。另外，由于页表检查是同步的，它可以帮助调
+试双映射内存损坏问题，在错误的映射发生时崩溃内核，而不是在内存损坏错误发生后内核崩溃。
+
+双重映射检测逻辑
+================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping   | New mapping       | Permissions       | Rule             |
++===================+===================+===================+==================+
+| Anonymous         | Anonymous         | Read              | Allow            |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous         | Named             | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Anonymous         | Any               | Prohibit         |
++-------------------+-------------------+-------------------+------------------+
+| Named             | Named             | Any               | Allow            |
++-------------------+-------------------+-------------------+------------------+
+
+启用页表检查
+============
+
+用以下方法构建内核:
+
+- PAGE_TABLE_CHECK=y
+  注意，它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。
+
+- 使用 "page_table_check=on" 内核参数启动。
+
+可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核，以便在没有额外的内核参数的情况下获得页表
+支持。
diff --git a/Documentation/translations/zh_CN/mm/remap_file_pages.rst b/Documentation/translations/zh_CN/mm/remap_file_pages.rst
new file mode 100644
index 000000000000..31e0c54dc36f
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/remap_file_pages.rst
@@ -0,0 +1,32 @@
+:Original: Documentation/mm/remap_file_pages.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+==============================
+remap_file_pages()系统调用
+==============================
+
+remap_file_pages()系统调用被用来创建一个非线性映射，也就是说，在这个映射中，
+文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好
+处是，前者不需要内核创建额外的VMA（虚拟内存区）数据结构。
+
+支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码，包括热
+路径。另外，为了使非线性映射工作，内核需要一种方法来区分正常的页表项和带有文件
+偏移的项（pte_file）。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资
+源，特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。
+
+幸运的是，在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS
+实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。
+由于64位系统的广泛使用，这种使用情况已经不重要了。
+
+syscall被废弃了，现在用一个模拟来代替它。仿真会创建新的VMA，而不是非线性映射。
+对于remap_file_pages()的少数用户来说，它的工作速度会变慢，但ABI被保留了。
+
+仿真的一个副作用（除了性能之外）是，由于额外的VMA，用户可以更容易达到
+vm.max_map_count的限制。关于限制的更多细节，请参见DEFAULT_MAX_MAP_COUNT
+的注释。
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
new file mode 100644
index 000000000000..4fb7aa666037
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst
@@ -0,0 +1,96 @@
+:Original: Documentation/mm/split_page_table_lock.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=================================
+分页表锁（split page table lock）
+=================================
+
+最初，mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方
+法导致了多线程应用程序的缺页异常可扩展性差，因为对锁的争夺很激烈。为了提高可扩
+展性，我们引入了分页表锁。
+
+有了分页表锁，我们就有了单独的每张表锁来顺序化对表的访问。目前，我们对PTE和
+PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。
+
+有一些辅助工具来锁定/解锁一个表和其他访问器函数:
+
+ - pte_offset_map_lock()
+	映射pte并获取PTE表锁，返回所取锁的指针；
+ - pte_unmap_unlock()
+	解锁和解映射PTE表；
+ - pte_alloc_map_lock()
+	如果需要的话，分配PTE表并获取锁，如果分配失败，返回已获取的锁的指针
+	或NULL；
+ - pte_lockptr()
+	返回指向PTE表锁的指针；
+ - pmd_lock()
+	取得PMD表锁，返回所取锁的指针。
+ - pmd_lockptr()
+	返回指向PMD表锁的指针；
+
+如果CONFIG_SPLIT_PTLOCK_CPUS（通常为4）小于或等于NR_CPUS，则在编译
+时启用PTE表的分页表锁。如果分页锁被禁用，所有的表都由mm->page_table_lock
+来保护。
+
+如果PMD表启用了分页锁，并且架构支持它，那么PMD表的分页锁就会被启用（见
+下文）。
+
+Hugetlb 和分页表锁
+==================
+
+Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁，但不对PUD使用。
+
+Hugetlb特定的辅助函数:
+
+ - huge_pte_lock()
+	对PMD_SIZE页面采取pmd分割锁，否则mm->page_table_lock；
+ - huge_pte_lockptr()
+	返回指向表锁的指针。
+
+架构对分页表锁的支持
+====================
+
+没有必要特别启用PTE分页表锁：所有需要的东西都由pgtable_pte_page_ctor()
+和pgtable_pte_page_dtor()完成，它们必须在PTE表分配/释放时被调用。
+
+确保架构不使用slab分配器来分配页表：slab使用page->slab_cache来分配其页
+面。这个区域与page->ptl共享存储。
+
+PMD分页锁只有在你有两个以上的页表级别时才有意义。
+
+启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor()，在释放时调
+用pgtable_pmd_page_dtor()。
+
+分配通常发生在pmd_alloc_one()中，释放发生在pmd_free()和pmd_free_tlb()
+中，但要确保覆盖所有的PMD表分配/释放路径：即X86_PAE在pgd_alloc()中预先
+分配一些PMD。
+
+一切就绪后，你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
+
+注意：pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
+须正确处理。
+
+page->ptl
+=========
+
+page->ptl用于访问分割页表锁，其中'page'是包含该表的页面struct page。它
+与page->private（以及union中的其他几个字段）共享存储。
+
+为了避免增加struct page的大小并获得最佳性能，我们使用了一个技巧:
+
+ - 如果spinlock_t适合于long，我们使用page->ptr作为spinlock，这样我们
+   就可以避免间接访问并节省一个缓存行。
+ - 如果spinlock_t的大小大于long的大小，我们使用page->ptl作为spinlock_t
+   的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
+   情况下使用分页锁，但由于间接访问而多花了一个缓存行。
+
+PTE表的spinlock_t分配在pgtable_pte_page_ctor()中，PMD表的spinlock_t
+分配在pgtable_pmd_page_ctor()中。
+
+请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/Documentation/translations/zh_CN/mm/z3fold.rst b/Documentation/translations/zh_CN/mm/z3fold.rst
new file mode 100644
index 000000000000..9569a6d88270
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/z3fold.rst
@@ -0,0 +1,31 @@
+:Original: Documentation/mm/z3fold.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+======
+z3fold
+======
+
+z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
+它是zbud的衍生物，允许更高的压缩率，保持其前辈的简单性和确定性。
+
+z3fold和zbud的主要区别是:
+
+* 与zbud不同的是，z3fold允许最大的PAGE_SIZE分配。
+* z3fold在其页面中最多可以容纳3个压缩页面
+* z3fold本身没有输出任何API，因此打算通过zpool的API来使用
+
+为了保持确定性和简单性，z3fold，就像zbud一样，总是在每页存储一个整数的压缩页，但是
+它最多可以存储3页，不像zbud最多可以存储2页。因此压缩率达到2.7倍左右，而zbud的压缩
+率是1.7倍左右。
+
+不像zbud（但也像zsmalloc），z3fold_alloc()那样不返回一个可重复引用的指针。相反，它
+返回一个无符号长句柄，它编码了被分配对象的实际位置。
+
+保持有效的压缩率接近于zsmalloc，z3fold不依赖于MMU的启用，并提供更可预测的回收行
+为，这使得它更适合于小型和反应迅速的系统。
diff --git a/Documentation/translations/zh_CN/mm/zsmalloc.rst b/Documentation/translations/zh_CN/mm/zsmalloc.rst
new file mode 100644
index 000000000000..b5596ea08ae4
--- /dev/null
+++ b/Documentation/translations/zh_CN/mm/zsmalloc.rst
@@ -0,0 +1,78 @@
+:Original: Documentation/mm/zs_malloc.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+========
+zsmalloc
+========
+
+这个分配器是为与zram一起使用而设计的。因此，该分配器应该在低内存条件下工作良好。特别是，
+它从未尝试过higher order页面的分配，这在内存压力下很可能会失败。另一方面，如果我们只
+是使用单（0-order）页，它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将
+占据整个页面。这是其前身（xvmalloc）的主要问题之一。
+
+为了克服这些问题，zsmalloc分配了一堆0-order页面，并使用各种"struct page"字段将它
+们链接起来。这些链接的页面作为一个单一的higher order页面，即一个对象可以跨越0-order
+页面的边界。代码将这些链接的页面作为一个实体，称为zspage。
+
+为了简单起见，zsmalloc只能分配大小不超过PAGE_SIZE的对象，因为这满足了所有当前用户的
+要求（在最坏的情况下，页面是不可压缩的，因此以"原样"即未压缩的形式存储）。对于大于这
+个大小的分配请求，会返回失败（见zs_malloc）。
+
+此外，zs_malloc()并不返回一个可重复引用的指针。相反，它返回一个不透明的句柄（无符号
+长），它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久
+映射，因为这在32位系统上会导致问题，因为内核空间映射的VA区域非常小。因此，在使用分配
+的内存之前，对象必须使用zs_map_object()进行映射以获得一个可用的指针，随后使用
+zs_unmap_object()解除映射。
+
+stat
+====
+
+通过CONFIG_ZSMALLOC_STAT，我们可以通过 ``/sys/kernel/debug/zsmalloc/<user name>``
+看到zsmalloc内部信息。下面是一个统计输出的例子。::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
+    ...
+    ...
+     9   176           0            1           186        129          8                4
+    10   192           1            0          2880       2872        135                3
+    11   208           0            1           819        795         42                2
+    12   224           0            1           219        159         12                4
+    ...
+    ...
+
+
+class
+	索引
+size
+	zspage存储对象大小
+almost_empty
+	ZS_ALMOST_EMPTY zspage的数量（见下文）。
+almost_full
+	ZS_ALMOST_FULL zspage的数量(见下图)
+obj_allocated
+	已分配对象的数量
+obj_used
+	分配给用户的对象的数量
+pages_used
+	为该类分配的页数
+pages_per_zspage
+	组成一个zspage的0-order页面的数量
+
+当n <= N / f时，我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组，其中
+
+* n = 已分配对象的数量
+* N = zspage可以存储的对象总数
+* f = fullness_threshold_frac(即，目前是4个)
+
+同样地，我们将zspage分配给:
+
+* ZS_ALMOST_FULL  when n > N / f
+* ZS_EMPTY        when n == 0
+* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_CN/vm/active_mm.rst b/Documentation/translations/zh_CN/vm/active_mm.rst
deleted file mode 100644
index 366609ea4f37..000000000000
--- a/Documentation/translations/zh_CN/vm/active_mm.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/active_mm.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-=========
-Active MM
-=========
-
-这是一封linux之父回复开发者的一封邮件，所以翻译时我尽量保持邮件格式的完整。
-
-::
-
- List:       linux-kernel
- Subject:    Re: active_mm
- From:       Linus Torvalds <torvalds () transmeta ! com>
- Date:       1999-07-30 21:36:24
-
- 因为我并不经常写解释，所以已经抄送到linux-kernel邮件列表，而当我做这些，
- 且更多的人在阅读它们时，我觉得棒极了。
-
- 1999年7月30日 星期五， David Mosberger 写道：
- >
- > 是否有一个简短的描述，说明task_struct中的
- >  "mm" 和 "active_mm"应该如何使用？ (如果
- > 这个问题在邮件列表中讨论过，我表示歉意--我刚
- > 刚度假回来，有一段时间没能关注linux-kernel了）。
-
- 基本上，新的设定是：
-
-  - 我们有“真实地址空间”和“匿名地址空间”。区别在于，匿名地址空间根本不关心用
-    户级页表，所以当我们做上下文切换到匿名地址空间时，我们只是让以前的地址空间
-    处于活动状态。
-
-    一个“匿名地址空间”的明显用途是任何不需要任何用户映射的线程--所有的内核线
-    程基本上都属于这一类，但即使是“真正的”线程也可以暂时说在一定时间内它们不
-    会对用户空间感兴趣，调度器不妨试着避免在切换VM状态上浪费时间。目前只有老
-    式的bdflush sync能做到这一点。
-
-  - “tsk->mm” 指向 “真实地址空间”。对于一个匿名进程来说，tsk->mm将是NULL，
-    其逻辑原因是匿名进程实际上根本就 “没有” 真正的地址空间。
-
-  - 然而，我们显然需要跟踪我们为这样的匿名用户“偷用”了哪个地址空间。为此，我们
-    有 “tsk->active_mm”，它显示了当前活动的地址空间是什么。
-
-    规则是，对于一个有真实地址空间的进程（即tsk->mm是 non-NULL），active_mm
-    显然必须与真实的mm相同。
-
-    对于一个匿名进程，tsk->mm == NULL，而tsk->active_mm是匿名进程运行时
-    “借用”的mm。当匿名进程被调度走时，借用的地址空间被返回并清除。
-
- 为了支持所有这些，“struct mm_struct”现在有两个计数器：一个是 “mm_users”
- 计数器，即有多少 “真正的地址空间用户”，另一个是 “mm_count”计数器，即 “lazy”
- 用户（即匿名用户）的数量，如果有任何真正的用户，则加1。
-
- 通常情况下，至少有一个真正的用户，但也可能是真正的用户在另一个CPU上退出，而
- 一个lazy的用户仍在活动，所以你实际上得到的情况是，你有一个地址空间 **只**
- 被lazy的用户使用。这通常是一个短暂的生命周期状态，因为一旦这个线程被安排给一
- 个真正的线程，这个 “僵尸” mm就会被释放，因为 “mm_count”变成了零。
-
- 另外，一个新的规则是，**没有人** 再把 “init_mm” 作为一个真正的MM了。
- “init_mm”应该被认为只是一个 “没有其他上下文时的lazy上下文”，事实上，它主
- 要是在启动时使用，当时还没有真正的VM被创建。因此，用来检查的代码
-
-   if (current->mm == &init_mm)
-
- 一般来说，应该用
-
-   if (!current->mm)
-
- 取代上面的写法（这更有意义--测试基本上是 “我们是否有一个用户环境”，并且通常
- 由缺页异常处理程序和类似的东西来完成）。
-
- 总之，我刚才在ftp.kernel.org上放了一个pre-patch-2.3.13-1，因为它稍微改
- 变了接口以适配alpha（谁会想到呢，但alpha体系结构上下文切换代码实际上最终是
- 最丑陋的之一--不像其他架构的MM和寄存器状态是分开的，alpha的PALcode将两者
- 连接起来，你需要同时切换两者）。
-
- (文档来源 http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/translations/zh_CN/vm/balance.rst b/Documentation/translations/zh_CN/vm/balance.rst
deleted file mode 100644
index e98a47ef24a8..000000000000
--- a/Documentation/translations/zh_CN/vm/balance.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/balance.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-内存平衡
-========
-
-2000年1月开始，作者：Kanoj Sarcar <kanoj@sgi.com>
-
-对于 !__GFP_HIGH 和 !__GFP_KSWAPD_RECLAIM 以及非 __GFP_IO 的分配，需要进行
-内存平衡。
-
-调用者避免回收的第一个原因是调用者由于持有自旋锁或处于中断环境中而无法睡眠。第二个
-原因可能是，调用者愿意在不产生页面回收开销的情况下分配失败。这可能发生在有0阶回退
-选项的机会主义高阶分配请求中。在这种情况下，调用者可能也希望避免唤醒kswapd。
-
-__GFP_IO分配请求是为了防止文件系统死锁。
-
-在没有非睡眠分配请求的情况下，做平衡似乎是有害的。页面回收可以被懒散地启动，也就是
-说，只有在需要的时候（也就是区域的空闲内存为0），而不是让它成为一个主动的过程。
-
-也就是说，内核应该尝试从直接映射池中满足对直接映射页的请求，而不是回退到dma池中，
-这样就可以保持dma池为dma请求（不管是不是原子的）所填充。类似的争论也适用于高内存
-和直接映射的页面。相反，如果有很多空闲的dma页，最好是通过从dma池中分配一个来满足
-常规的内存请求，而不是产生常规区域平衡的开销。
-
-在2.2中，只有当空闲页总数低于总内存的1/64时，才会启动内存平衡/页面回收。如果dma
-和常规内存的比例合适，即使dma区完全空了，也很可能不会进行平衡。2.2已经在不同内存
-大小的生产机器上运行，即使有这个问题存在，似乎也做得不错。在2.3中，由于HIGHMEM的
-存在，这个问题变得更加严重。
-
-在2.3中，区域平衡可以用两种方式之一来完成：根据区域的大小（可能是低级区域的大小），
-我们可以在初始化阶段决定在平衡任何区域时应该争取多少空闲页。好的方面是，在平衡的时
-候，我们不需要看低级区的大小，坏的方面是，我们可能会因为忽略低级区可能较低的使用率
-而做过于频繁的平衡。另外，只要对分配程序稍作修改，就有可能将memclass()宏简化为一
-个简单的等式。
-
-另一个可能的解决方案是，我们只在一个区 **和** 其所有低级区的空闲内存低于该区及其
-低级区总内存的1/64时进行平衡。这就解决了2.2的平衡问题，并尽可能地保持了与2.2行为
-的接近。另外，平衡算法在各种架构上的工作方式也是一样的，这些架构有不同数量和类型的
-内存区。如果我们想变得更花哨一点，我们可以在未来为不同区域的自由页面分配不同的权重。
-
-请注意，如果普通区的大小与dma区相比是巨大的，那么在决定是否平衡普通区的时候，考虑
-空闲的dma页就变得不那么重要了。那么第一个解决方案就变得更有吸引力。
-
-所附的补丁实现了第二个解决方案。它还 “修复”了两个问题：首先，在低内存条件下，kswapd
-被唤醒，就像2.2中的非睡眠分配。第二，HIGHMEM区也被平衡了，以便给replace_with_highmem()
-一个争取获得HIGHMEM页的机会，同时确保HIGHMEM分配不会落回普通区。这也确保了HIGHMEM
-页不会被泄露（例如，在一个HIGHMEM页在交换缓存中但没有被任何人使用的情况下）。
-
-kswapd还需要知道它应该平衡哪些区。kswapd主要是在无法进行平衡的情况下需要的，可能
-是因为所有的分配请求都来自中断上下文，而所有的进程上下文都在睡眠。对于2.3，
-kswapd并不真正需要平衡高内存区，因为中断上下文并不请求高内存页。kswapd看zone
-结构体中的zone_wake_kswapd字段来决定一个区是否需要平衡。
-
-如果从进程内存和shm中偷取页面可以减轻该页面节点中任何区的内存压力，而该区的内存压力
-已经低于其水位，则会进行偷取。
-
-watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd：
-这些是每个区的字段，用于确定一个区何时需要平衡。当页面数低于水位[WMARK_MIN]时，
-hysteric 的字段low_on_memory被设置。这个字段会一直被设置，直到空闲页数变成水位
-[WMARK_HIGH]。当low_on_memory被设置时，页面分配请求将尝试释放该区域的一些页面（如果
-请求中设置了GFP_WAIT）。与此相反的是，决定唤醒kswapd以释放一些区的页。这个决定不是基于
-hysteresis 的，而是当空闲页的数量低于watermark[WMARK_LOW]时就会进行；在这种情况下，
-zone_wake_kswapd也被设置。
-
-
-我所听到的（超棒的）想法：
-
-1. 动态经历应该影响平衡：可以跟踪一个区的失败请求的数量，并反馈到平衡方案中（jalvo@mbay.net）。
-
-2. 实现一个类似于replace_with_highmem()的replace_with_regular()，以保留dma页面。
-   (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/translations/zh_CN/vm/damon/api.rst b/Documentation/translations/zh_CN/vm/damon/api.rst
deleted file mode 100644
index 21143eea4ebe..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/api.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/api.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-=======
-API参考
-=======
-
-内核空间的程序可以使用下面的API来使用DAMON的每个功能。你所需要做的就是引用 ``damon.h`` ，
-它位于源代码树的include/linux/。
-
-结构体
-======
-
-该API在以下内核代码中:
-
-include/linux/damon.h
-
-
-函数
-====
-
-该API在以下内核代码中:
-
-mm/damon/core.c
diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/vm/damon/design.rst
deleted file mode 100644
index 46128b77c2b3..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/design.rst
+++ /dev/null
@@ -1,140 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/design.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-====
-设计
-====
-
-可配置的层
-==========
-
-DAMON提供了数据访问监控功能，同时使其准确性和开销可控。基本的访问监控需要依赖于目标地址空间
-并为之优化的基元。另一方面，作为DAMON的核心，准确性和开销的权衡机制是在纯逻辑空间中。DAMON
-将这两部分分离在不同的层中，并定义了它的接口，以允许各种低层次的基元实现与核心逻辑的配置。
-
-由于这种分离的设计和可配置的接口，用户可以通过配置核心逻辑和适当的低级基元实现来扩展DAMON的
-任何地址空间。如果没有提供合适的，用户可以自己实现基元。
-
-例如，物理内存、虚拟内存、交换空间、那些特定的进程、NUMA节点、文件和支持的内存设备将被支持。
-另外，如果某些架构或设备支持特殊的优化访问检查基元，这些基元将很容易被配置。
-
-
-特定地址空间基元的参考实现
-==========================
-
-基本访问监测的低级基元被定义为两部分。:
-
-1. 确定地址空间的监测目标地址范围
-2. 目标空间中特定地址范围的访问检查。
-
-DAMON目前为物理和虚拟地址空间提供了基元的实现。下面两个小节描述了这些工作的方式。
-
-
-基于VMA的目标地址范围构造
--------------------------
-
-这仅仅是针对虚拟地址空间基元的实现。对于物理地址空间，只是要求用户手动设置监控目标地址范围。
-
-在进程的超级巨大的虚拟地址空间中，只有小部分被映射到物理内存并被访问。因此，跟踪未映射的地
-址区域只是一种浪费。然而，由于DAMON可以使用自适应区域调整机制来处理一定程度的噪声，所以严
-格来说，跟踪每一个映射并不是必须的，但在某些情况下甚至会产生很高的开销。也就是说，监测目标
-内部过于巨大的未映射区域应该被移除，以不占用自适应机制的时间。
-
-出于这个原因，这个实现将复杂的映射转换为三个不同的区域，覆盖地址空间的每个映射区域。这三个
-区域之间的两个空隙是给定地址空间中两个最大的未映射区域。这两个最大的未映射区域是堆和最上面
-的mmap()区域之间的间隙，以及在大多数情况下最下面的mmap()区域和堆之间的间隙。因为这些间隙
-在通常的地址空间中是异常巨大的，排除这些间隙就足以做出合理的权衡。下面详细说明了这一点::
-
-    <heap>
-    <BIG UNMAPPED REGION 1>
-    <uppermost mmap()-ed region>
-    (small mmap()-ed regions and munmap()-ed regions)
-    <lowermost mmap()-ed region>
-    <BIG UNMAPPED REGION 2>
-    <stack>
-
-
-基于PTE访问位的访问检查
------------------------
-
-物理和虚拟地址空间的实现都使用PTE Accessed-bit进行基本访问检查。唯一的区别在于从地址中
-找到相关的PTE访问位的方式。虚拟地址的实现是为该地址的目标任务查找页表，而物理地址的实现则
-是查找与该地址有映射关系的每一个页表。通过这种方式，实现者找到并清除下一个采样目标地址的位，
-并检查该位是否在一个采样周期后再次设置。这可能会干扰其他使用访问位的内核子系统，即空闲页跟
-踪和回收逻辑。为了避免这种干扰，DAMON使其与空闲页面跟踪相互排斥，并使用 ``PG_idle`` 和
-``PG_young`` 页面标志来解决与回收逻辑的冲突，就像空闲页面跟踪那样。
-
-
-独立于地址空间的核心机制
-========================
-
-下面四个部分分别描述了DAMON的核心机制和五个监测属性，即 ``采样间隔`` 、 ``聚集间隔`` 、
-``更新间隔`` 、 ``最小区域数`` 和 ``最大区域数`` 。
-
-
-访问频率监测
-------------
-
-DAMON的输出显示了在给定的时间内哪些页面的访问频率是多少。访问频率的分辨率是通过设置
-``采样间隔`` 和 ``聚集间隔`` 来控制的。详细地说，DAMON检查每个 ``采样间隔`` 对每
-个页面的访问，并将结果汇总。换句话说，计算每个页面的访问次数。在每个 ``聚合间隔`` 过
-去后，DAMON调用先前由用户注册的回调函数，以便用户可以阅读聚合的结果，然后再清除这些结
-果。这可以用以下简单的伪代码来描述::
-
-    while monitoring_on:
-        for page in monitoring_target:
-            if accessed(page):
-                nr_accesses[page] += 1
-        if time() % aggregation_interval == 0:
-            for callback in user_registered_callbacks:
-                callback(monitoring_target, nr_accesses)
-            for page in monitoring_target:
-                nr_accesses[page] = 0
-        sleep(sampling interval)
-
-这种机制的监测开销将随着目标工作负载规模的增长而任意增加。
-
-
-基于区域的抽样调查
-------------------
-
-为了避免开销的无限制增加，DAMON将假定具有相同访问频率的相邻页面归入一个区域。只要保持
-这个假设（一个区域内的页面具有相同的访问频率），该区域内就只需要检查一个页面。因此，对
-于每个 ``采样间隔`` ，DAMON在每个区域中随机挑选一个页面，等待一个 ``采样间隔`` ，检
-查该页面是否同时被访问，如果被访问则增加该区域的访问频率。因此，监测开销是可以通过设置
-区域的数量来控制的。DAMON允许用户设置最小和最大的区域数量来进行权衡。
-
-然而，如果假设没有得到保证，这个方案就不能保持输出的质量。
-
-
-适应性区域调整
---------------
-
-即使最初的监测目标区域被很好地构建以满足假设（同一区域内的页面具有相似的访问频率），数
-据访问模式也会被动态地改变。这将导致监测质量下降。为了尽可能地保持假设，DAMON根据每个
-区域的访问频率自适应地进行合并和拆分。
-
-对于每个 ``聚集区间`` ，它比较相邻区域的访问频率，如果频率差异较小，就合并这些区域。
-然后，在它报告并清除每个区域的聚合接入频率后，如果区域总数不超过用户指定的最大区域数，
-它将每个区域拆分为两个或三个区域。
-
-通过这种方式，DAMON提供了其最佳的质量和最小的开销，同时保持了用户为其权衡设定的界限。
-
-
-动态目标空间更新处理
---------------------
-
-监测目标地址范围可以动态改变。例如，虚拟内存可以动态地被映射和解映射。物理内存可以被
-热插拔。
-
-由于在某些情况下变化可能相当频繁，DAMON允许监控操作检查动态变化，包括内存映射变化，
-并仅在用户指定的时间间隔（ ``更新间隔`` ）中的每个时间段，将其应用于监控操作相关的
-数据结构，如抽象的监控目标内存区。
\ No newline at end of file
diff --git a/Documentation/translations/zh_CN/vm/damon/faq.rst b/Documentation/translations/zh_CN/vm/damon/faq.rst
deleted file mode 100644
index 07b4ac19407d..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/faq.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/faq.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-常见问题
-========
-
-为什么是一个新的子系统，而不是扩展perf或其他用户空间工具？
-==========================================================
-
-首先，因为它需要尽可能的轻量级，以便可以在线使用，所以应该避免任何不必要的开销，如内核-用户
-空间的上下文切换成本。第二，DAMON的目标是被包括内核在内的其他程序所使用。因此，对特定工具
-（如perf）的依赖性是不可取的。这就是DAMON在内核空间实现的两个最大的原因。
-
-
-“闲置页面跟踪” 或 “perf mem” 可以替代DAMON吗？
-==============================================
-
-闲置页跟踪是物理地址空间访问检查的一个低层次的原始方法。“perf mem”也是类似的，尽管它可以
-使用采样来减少开销。另一方面，DAMON是一个更高层次的框架，用于监控各种地址空间。它专注于内
-存管理优化，并提供复杂的精度/开销处理机制。因此，“空闲页面跟踪” 和 “perf mem” 可以提供
-DAMON输出的一个子集，但不能替代DAMON。
-
-
-DAMON是否只支持虚拟内存？
-=========================
-
-不，DAMON的核心是独立于地址空间的。用户可以在DAMON核心上实现和配置特定地址空间的低级原始
-部分，包括监测目标区域的构造和实际的访问检查。通过这种方式，DAMON用户可以用任何访问检查技
-术来监测任何地址空间。
-
-尽管如此，DAMON默认为虚拟内存和物理内存提供了基于vma/rmap跟踪和PTE访问位检查的地址空间
-相关功能的实现，以供参考和方便使用。
-
-
-我可以简单地监测页面的粒度吗？
-==============================
-
-是的，你可以通过设置 ``min_nr_regions`` 属性高于工作集大小除以页面大小的值来实现。
-因为监视目标区域的大小被强制为 ``>=page size`` ，所以区域分割不会产生任何影响。
diff --git a/Documentation/translations/zh_CN/vm/damon/index.rst b/Documentation/translations/zh_CN/vm/damon/index.rst
deleted file mode 100644
index 84d36d90c9b0..000000000000
--- a/Documentation/translations/zh_CN/vm/damon/index.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/damon/index.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-==========================
-DAMON:数据访问监视器
-==========================
-
-DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为
-（该核心机制详见(Documentation/translations/zh_CN/vm/damon/design.rst)）
-
- - *准确度* （监测输出对DRAM级别的内存管理足够有用；但可能不适合CPU Cache级别），
- - *轻量级* （监控开销低到可以在线应用），以及
- - *可扩展* （无论目标工作负载的大小，开销的上限值都在恒定范围内）。
-
-因此，利用这个框架，内核的内存管理机制可以做出高级决策。会导致高数据访问监控开销的实
-验性内存管理优化工作可以再次进行。同时，在用户空间，有一些特殊工作负载的用户可以编写
-个性化的应用程序，以便更好地了解和优化他们的工作负载和系统。
-
-.. toctree::
-   :maxdepth: 2
-
-   faq
-   design
-   api
-
diff --git a/Documentation/translations/zh_CN/vm/free_page_reporting.rst b/Documentation/translations/zh_CN/vm/free_page_reporting.rst
deleted file mode 100644
index 31d6c34b956b..000000000000
--- a/Documentation/translations/zh_CN/vm/free_page_reporting.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/_free_page_reporting.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==========
-空闲页报告
-==========
-
-空闲页报告是一个API，设备可以通过它来注册接收系统当前未使用的页面列表。这在虚拟
-化的情况下是很有用的，客户机能够使用这些数据来通知管理器它不再使用内存中的某些页
-面。
-
-对于驱动，通常是气球驱动要使用这个功能，它将分配和初始化一个page_reporting_dev_info
-结构体。它要填充的结构体中的字段是用于处理散点列表的 "report" 函数指针。它还必
-须保证每次调用该函数时能处理至少相当于PAGE_REPORTING_CAPACITY的散点列表条目。
-假设没有其他页面报告设备已经注册， 对page_reporting_register的调用将向报告框
-架注册页面报告接口。
-
-一旦注册，页面报告API将开始向驱动报告成批的页面。API将在接口被注册后2秒开始报告
-页面，并在任何足够高的页面被释放之后2秒继续报告。
-
-报告的页面将被存储在传递给报告函数的散列表中，最后一个条目的结束位被设置在条目
-nent-1中。 当页面被报告函数处理时，分配器将无法访问它们。一旦报告函数完成，这些
-页将被返回到它们所获得的自由区域。
-
-在移除使用空闲页报告的驱动之前，有必要调用page_reporting_unregister，以移除
-目前被空闲页报告使用的page_reporting_dev_info结构体。这样做将阻止进一步的报
-告通过该接口发出。如果另一个驱动或同一驱动被注册，它就有可能恢复前一个驱动在报告
-空闲页方面的工作。
-
-
-Alexander Duyck, 2019年12月04日
diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/vm/frontswap.rst
deleted file mode 100644
index 3eb07870e2ef..000000000000
--- a/Documentation/translations/zh_CN/vm/frontswap.rst
+++ /dev/null
@@ -1,196 +0,0 @@
-:Original: Documentation/vm/_free_page_reporting.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-=========
-Frontswap
-=========
-
-Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中，由
-于交换页被保存在RAM（或类似RAM的设备）中，而不是交换磁盘，因此可以获得巨大的性能
-节省（提高）。
-
-.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
-
-Frontswap之所以这么命名，是因为它可以被认为是与swap设备的“back”存储相反。存
-储器被认为是一个同步并发安全的面向页面的“伪RAM设备”，符合transcendent memory
-（如Xen的“tmem”，或内核内压缩内存，又称“zcache”，或未来的类似RAM的设备）的要
-求；这个伪RAM设备不能被内核直接访问或寻址，其大小未知且可能随时间变化。驱动程序通过
-调用frontswap_register_ops将自己与frontswap链接起来，以适当地设置frontswap_ops
-的功能，它提供的功能必须符合某些策略，如下所示:
-
-一个 “init” 将设备准备好接收与指定的交换设备编号（又称“类型”）相关的frontswap
-交换页。一个 “store” 将把该页复制到transcendent memory，并与该页的类型和偏移
-量相关联。一个 “load” 将把该页，如果找到的话，从transcendent memory复制到内核
-内存，但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从
-transcendent memory中删除该页，一个 “invalidate_area” 将删除所有与交换类型
-相关的页（例如，像swapoff）并通知 “device” 拒绝进一步存储该交换类型。
-
-一旦一个页面被成功存储，在该页面上的匹配加载通常会成功。因此，当内核发现自己处于需
-要交换页面的情况时，它首先尝试使用frontswap。如果存储的结果是成功的，那么数据就已
-经成功的保存到了transcendent memory中，并且避免了磁盘写入，如果后来再读回数据，
-也避免了磁盘读取。如果存储返回失败，transcendent memory已经拒绝了该数据，且该页
-可以像往常一样被写入交换空间。
-
-请注意，如果一个页面被存储，而该页面已经存在于transcendent memory中（一个 “重复”
-的存储），要么存储成功，数据被覆盖，要么存储失败，该页面被废止。这确保了旧的数据永远
-不会从frontswap中获得。
-
-如果配置正确，对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的
-debugfs完成的。frontswap的有效性可以通过以下方式测量（在所有交换设备中）:
-
-``failed_stores``
-	有多少次存储的尝试是失败的
-
-``loads``
-	尝试了多少次加载（应该全部成功）
-
-``succ_stores``
-	有多少次存储的尝试是成功的
-
-``invalidates``
-	尝试了多少次作废
-
-后台实现可以提供额外的指标。
-
-经常问到的问题
-==============
-
-* 价值在哪里?
-
-当一个工作负载开始交换时，性能就会下降。Frontswap通过提供一个干净的、动态的接口来
-读取和写入交换页到 “transcendent memory”，从而大大增加了许多这样的工作负载的性
-能，否则内核是无法直接寻址的。当数据被转换为不同的形式和大小（比如压缩）或者被秘密
-移动（对于一些类似RAM的设备来说，这可能对写平衡很有用）时，这个接口是理想的。交换
-页（和被驱逐的页面缓存页）是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。
-
-Frontswap对内核的影响相当小，为各种系统配置中更动态、更灵活的RAM利用提供了巨大的
-灵活性：
-
-在单一内核的情况下，又称“zcache”，页面被压缩并存储在本地内存中，从而增加了可以安
-全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利
-用率。Benchmarks测试显示，当内存压力较低时，几乎没有影响，而在高内存压力下的一些
-工作负载上，则有明显的性能改善（25%以上）。
-
-“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory
-的支持。Frontswap页面像zcache一样被本地压缩，但随后被“remotified” 到另一个系
-统的RAM。这使得RAM可以根据需要动态地来回负载平衡，也就是说，当系统A超载时，它可以
-交换到系统B，反之亦然。RAMster也可以被配置成一个内存服务器，因此集群中的许多服务器
-可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户
-有多少内存可用
-
-在虚拟情况下，虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复
-用。对于RAM来说，这真的很难做到，而且在不改变内核的情况下，要做好这一点的努力基本上
-是失败的（除了一些广为人知的特殊情况下的工作负载）。具体来说，Xen Transcendent Memory
-后端允许管理器拥有的RAM “fallow”，不仅可以在多个虚拟机之间进行“time-shared”，
-而且页面可以被压缩和重复利用，以优化RAM的利用率。当客户操作系统被诱导交出未充分利用
-的RAM时（如 “selfballooning”），突然出现的意外内存压力可能会导致交换；frontswap
-允许这些页面被交换到管理器RAM中或从管理器RAM中交换（如果整体主机系统内存条件允许），
-从而减轻计划外交换可能带来的可怕的性能影响。
-
-一个KVM的实现正在进行中，并且已经被RFC'ed到lkml。而且，利用frontswap，对NVM作为
-内存扩展技术的调查也在进行中。
-
-* 当然，在某些情况下可能有性能上的优势，但frontswap的空间/时间开销是多少？
-
-如果 CONFIG_FRONTSWAP 被禁用，每个 frontswap 钩子都会编译成空，唯一的开销是每
-个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用，但没有
-frontswap的 “backend” 寄存器，每读或写一个交换页就会有一个额外的全局变量，而不
-是零。如果 CONFIG_FRONTSWAP 被启用，并且有一个frontswap的backend寄存器，并且
-后端每次 “store” 请求都失败（即尽管声称可能，但没有提供内存），CPU 的开销仍然可以
-忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前，系统很可能是 I/O 绑定
-的，无论如何使用一小部分的 CPU 都是不相关的。
-
-至于空间，如果CONFIG_FRONTSWAP被启用，并且有一个frontswap的backend注册，那么
-每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换
-页分配的8位（在2.6.34之前是16位）上增加的。(Hugh Dickins观察到，frontswap可能
-会偷取现有的8个比特，但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的
-非常大的交换盘（这很罕见），这是每32GB交换盘1MB开销。
-
-当交换页存储在transcendent memory中而不是写到磁盘上时，有一个副作用，即这可能会
-产生更多的内存压力，有可能超过其他的优点。一个backend，比如zcache，必须实现策略
-来仔细（但动态地）管理内存限制，以确保这种情况不会发生。
-
-* 好吧，那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何？
-
-我们假设在内核初始化过程中，一个frontswap 的 “backend” 已经注册了；这个注册表
-明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提
-供了多少内存是完全动态和随机的。
-
-每当一个交换设备被交换时，就会调用frontswap_init()，把交换设备的编号（又称“类
-型”）作为一个参数传给它。这就通知了frontswap，以期待 “store” 与该号码相关的交
-换页的尝试。
-
-每当交换子系统准备将一个页面写入交换设备时（参见swap_writepage()），就会调用
-frontswap_store。Frontswap与frontswap backend协商，如果backend说它没有空
-间，frontswap_store返回-1，内核就会照常把页换到交换设备上。注意，来自frontswap
-backend的响应对内核来说是不可预测的；它可能选择从不接受一个页面，可能接受每九个
-页面，也可能接受每一个页面。但是如果backend确实接受了一个页面，那么这个页面的数
-据已经被复制并与类型和偏移量相关联了，而且backend保证了数据的持久性。在这种情况
-下，frontswap在交换设备的“frontswap_map” 中设置了一个位，对应于交换设备上的
-页面偏移量，否则它就会将数据写入该设备。
-
-当交换子系统需要交换一个页面时（swap_readpage()），它首先调用frontswap_load()，
-检查frontswap_map，看这个页面是否早先被frontswap backend接受。如果是，该页
-的数据就会从frontswap后端填充，换入就完成了。如果不是，正常的交换代码将被执行，
-以便从真正的交换设备上获得这一页的数据。
-
-所以每次frontswap backend接受一个页面时，交换设备的读取和（可能）交换设备的写
-入都被 “frontswap backend store” 和（可能）“frontswap backend loads”
-所取代，这可能会快得多。
-
-* frontswap不能被配置为一个 “特殊的” 交换设备，它的优先级要高于任何真正的交换
-  设备（例如像zswap，或者可能是swap-over-nbd/NFS）？
-
-首先，现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次
-结构，但这将需要相当大的改变。即使它被重写，现有的交换子系统也使用了块I/O层，它
-假定交换设备是固定大小的，其中的任何页面都是可线性寻址的。Frontswap几乎没有触
-及现有的交换子系统，而是围绕着块I/O子系统的限制，提供了大量的灵活性和动态性。
-
-例如，frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend
-的定义至关重要，因为它赋予了backend完全动态的决定权。在zcache中，人们无法预
-先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝，而 “差” 本身也可
-以根据当前的内存限制动态地定义。
-
-此外，frontswap是完全同步的，而真正的交换设备，根据定义，是异步的，并且使用
-块I/O。块I/O层不仅是不必要的，而且可能进行 “优化”，这对面向RAM的设备来说是
-不合适的，包括将一些页面的写入延迟相当长的时间。同步是必须的，以确保后端的动
-态性，并避免棘手的竞争条件，这将不必要地大大增加frontswap和/或块I/O子系统的
-复杂性。也就是说，只有最初的 “store” 和 “load” 操作是需要同步的。一个独立
-的异步线程可以自由地操作由frontswap存储的页面。例如，RAMster中的 “remotification”
-线程使用标准的异步内核套接字，将压缩的frontswap页面移动到远程机器。同样，
-KVM的客户方实现可以进行客户内压缩，并使用 “batched” hypercalls。
-
-在虚拟化环境中，动态性允许管理程序（或主机操作系统）做“intelligent overcommit”。
-例如，它可以选择只接受页面，直到主机交换可能即将发生，然后强迫客户机做他们
-自己的交换。
-
-transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可
-能失败，所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此，
-frontswap必须作为每个交换设备的 “影子” 来实现，它有可能容纳交换设备可能
-容纳的每一个页面，也有可能根本不容纳任何页面。这意味着frontswap不能包含比
-swap设备总数更多的页面。例如，如果在某些安装上没有配置交换设备，frontswap
-就没有用。无交换设备的便携式设备仍然可以使用frontswap，但是这种设备的
-backend必须配置某种 “ghost” 交换设备，并确保它永远不会被使用。
-
-
-* 为什么会有这种关于 “重复存储” 的奇怪定义？如果一个页面以前被成功地存储过，
-  难道它不能总是被成功地覆盖吗？
-
-几乎总是可以的，不，有时不能。考虑一个例子，数据被压缩了，原来的4K页面被压
-缩到了1K。现在，有人试图用不可压缩的数据覆盖该页，因此会占用整个4K。但是
-backend没有更多的空间了。在这种情况下，这个存储必须被拒绝。每当frontswap
-拒绝一个会覆盖的存储时，它也必须使旧的数据作废，并确保它不再被访问。因为交
-换子系统会把新的数据写到读交换设备上，这是确保一致性的正确做法。
-
-* 为什么frontswap补丁会创建新的头文件swapfile.h？
-
-frontswap代码依赖于一些swap子系统内部的数据结构，这些数据结构多年来一直
-在静态和全局之间来回移动。这似乎是一个合理的妥协：将它们定义为全局，但在一
-个新的包含文件中声明它们，该文件不被包含swap.h的大量源文件所包含。
-
-Dan Magenheimer，最后更新于2012年4月9日
diff --git a/Documentation/translations/zh_CN/vm/highmem.rst b/Documentation/translations/zh_CN/vm/highmem.rst
deleted file mode 100644
index 018838e58c3e..000000000000
--- a/Documentation/translations/zh_CN/vm/highmem.rst
+++ /dev/null
@@ -1,128 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/highmem.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==========
-高内存处理
-==========
-
-作者: Peter Zijlstra <a.p.zijlstra@chello.nl>
-
-.. contents:: :local:
-
-高内存是什么？
-==============
-
-当物理内存的大小接近或超过虚拟内存的最大大小时，就会使用高内存（highmem）。在这一点上，内
-核不可能在任何时候都保持所有可用的物理内存的映射。这意味着内核需要开始使用它想访问的物理内
-存的临时映射。
-
-没有被永久映射覆盖的那部分（物理）内存就是我们所说的 "高内存"。对于这个边界的确切位置，有
-各种架构上的限制。
-
-例如，在i386架构中，我们选择将内核映射到每个进程的虚拟空间，这样我们就不必为内核的进入/退
-出付出全部的TLB作废代价。这意味着可用的虚拟内存空间（i386上为4GiB）必须在用户和内核空间之
-间进行划分。
-
-使用这种方法的架构的传统分配方式是3:1，3GiB用于用户空间，顶部的1GiB用于内核空间。::
-
-		+--------+ 0xffffffff
-		| Kernel |
-		+--------+ 0xc0000000
-		|        |
-		| User   |
-		|        |
-		+--------+ 0x00000000
-
-这意味着内核在任何时候最多可以映射1GiB的物理内存，但是由于我们需要虚拟地址空间来做其他事
-情--包括访问其余物理内存的临时映射--实际的直接映射通常会更少（通常在~896MiB左右）。
-
-其他有mm上下文标签的TLB的架构可以有独立的内核和用户映射。然而，一些硬件（如一些ARM）在使
-用mm上下文标签时，其虚拟空间有限。
-
-
-临时虚拟映射
-============
-
-内核包含几种创建临时映射的方法。:
-
-* vmap().  这可以用来将多个物理页长期映射到一个连续的虚拟空间。它需要synchronization
-  来解除映射。
-
-* kmap().  这允许对单个页面进行短期映射。它需要synchronization，但在一定程度上被摊销。
-  当以嵌套方式使用时，它也很容易出现死锁，因此不建议在新代码中使用它。
-
-* kmap_atomic().  这允许对单个页面进行非常短的时间映射。由于映射被限制在发布它的CPU上，
-  它表现得很好，但发布任务因此被要求留在该CPU上直到它完成，以免其他任务取代它的映射。
-
-  kmap_atomic() 也可以由中断上下文使用，因为它不睡眠，而且调用者可能在调用kunmap_atomic()
-  之后才睡眠。
-
-  可以假设k[un]map_atomic()不会失败。
-
-
-使用kmap_atomic
-===============
-
-何时何地使用 kmap_atomic() 是很直接的。当代码想要访问一个可能从高内存（见__GFP_HIGHMEM）
-分配的页面的内容时，例如在页缓存中的页面，就会使用它。该API有两个函数，它们的使用方式与
-下面类似::
-
-	/* 找到感兴趣的页面。 */
-	struct page *page = find_get_page(mapping, offset);
-
-	/* 获得对该页内容的访问权。 */
-	void *vaddr = kmap_atomic(page);
-
-	/* 对该页的内容做一些处理。 */
-	memset(vaddr, 0, PAGE_SIZE);
-
-	/* 解除该页面的映射。 */
-	kunmap_atomic(vaddr);
-
-注意，kunmap_atomic()调用的是kmap_atomic()调用的结果而不是参数。
-
-如果你需要映射两个页面，因为你想从一个页面复制到另一个页面，你需要保持kmap_atomic调用严
-格嵌套，如::
-
-	vaddr1 = kmap_atomic(page1);
-	vaddr2 = kmap_atomic(page2);
-
-	memcpy(vaddr1, vaddr2, PAGE_SIZE);
-
-	kunmap_atomic(vaddr2);
-	kunmap_atomic(vaddr1);
-
-
-临时映射的成本
-==============
-
-创建临时映射的代价可能相当高。体系架构必须操作内核的页表、数据TLB和/或MMU的寄存器。
-
-如果CONFIG_HIGHMEM没有被设置，那么内核会尝试用一点计算来创建映射，将页面结构地址转换成
-指向页面内容的指针，而不是去捣鼓映射。在这种情况下，解映射操作可能是一个空操作。
-
-如果CONFIG_MMU没有被设置，那么就不可能有临时映射和高内存。在这种情况下，也将使用计算方法。
-
-
-i386 PAE
-========
-
-在某些情况下，i386 架构将允许你在 32 位机器上安装多达 64GiB 的内存。但这有一些后果:
-
-* Linux需要为系统中的每个页面建立一个页帧结构，而且页帧需要驻在永久映射中，这意味着：
-
-* 你最多可以有896M/sizeof(struct page)页帧；由于页结构体是32字节的，所以最终会有
-  112G的页；然而，内核需要在内存中存储更多的页帧......
-
-* PAE使你的页表变大--这使系统变慢，因为更多的数据需要在TLB填充等方面被访问。一个好处
-  是，PAE有更多的PTE位，可以提供像NX和PAT这样的高级功能。
-
-一般的建议是，你不要在32位机器上使用超过8GiB的空间--尽管更多的空间可能对你和你的工作
-量有用，但你几乎是靠你自己--不要指望内核开发者真的会很关心事情的进展情况。
diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/vm/hmm.rst
deleted file mode 100644
index 2379df95aa58..000000000000
--- a/Documentation/translations/zh_CN/vm/hmm.rst
+++ /dev/null
@@ -1,361 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/hmm.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==================
-异构内存管理 (HMM)
-==================
-
-提供基础设施和帮助程序以将非常规内存（设备内存，如板上 GPU 内存）集成到常规内核路径中，其
-基石是此类内存的专用struct page（请参阅本文档的第 5 至 7 节）。
-
-HMM 还为 SVM（共享虚拟内存）提供了可选的帮助程序，即允许设备透明地访问与 CPU 一致的程序
-地址，这意味着 CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用变得
-必不可少，其中 GPU、DSP 或 FPGA 用于代表进程执行各种计算。
-
-本文档分为以下部分：在第一部分中，我揭示了与使用特定于设备的内存分配器相关的问题。在第二
-部分中，我揭示了许多平台固有的硬件限制。第三部分概述了 HMM 设计。第四部分解释了 CPU 页
-表镜像的工作原理以及 HMM 在这种情况下的目的。第五部分处理内核中如何表示设备内存。最后，
-最后一节介绍了一个新的迁移助手，它允许利用设备 DMA 引擎。
-
-.. contents:: :local:
-
-使用特定于设备的内存分配器的问题
-================================
-
-具有大量板载内存（几 GB）的设备（如 GPU）历来通过专用驱动程序特定 API 管理其内存。这会
-造成设备驱动程序分配和管理的内存与常规应用程序内存（私有匿名、共享内存或常规文件支持内存）
-之间的隔断。从这里开始，我将把这个方面称为分割的地址空间。我使用共享地址空间来指代相反的情况：
-即，设备可以透明地使用任何应用程序内存区域。
-
-分割的地址空间的发生是因为设备只能访问通过设备特定 API 分配的内存。这意味着从设备的角度来
-看，程序中的所有内存对象并不平等，这使得依赖于广泛的库的大型程序变得复杂。
-
-具体来说，这意味着想要利用像 GPU 这样的设备的代码需要在通用分配的内存（malloc、mmap
-私有、mmap 共享）和通过设备驱动程序 API 分配的内存之间复制对象（这仍然以 mmap 结束，
-但是是设备文件）。
-
-对于平面数据集（数组、网格、图像……），这并不难实现，但对于复杂数据集（列表、树……），
-很难做到正确。复制一个复杂的数据集需要重新映射其每个元素之间的所有指针关系。这很容易出错，
-而且由于数据集和地址的重复，程序更难调试。
-
-分割地址空间也意味着库不能透明地使用它们从核心程序或另一个库中获得的数据，因此每个库可能
-不得不使用设备特定的内存分配器来重复其输入数据集。大型项目会因此受到影响，并因为各种内存
-拷贝而浪费资源。
-
-复制每个库的API以接受每个设备特定分配器分配的内存作为输入或输出，并不是一个可行的选择。
-这将导致库入口点的组合爆炸。
-
-最后，随着高级语言结构（在 C++ 中，当然也在其他语言中）的进步，编译器现在有可能在没有程
-序员干预的情况下利用 GPU 和其他设备。某些编译器识别的模式仅适用于共享地址空间。对所有
-其他模式，使用共享地址空间也更合理。
-
-
-I/O 总线、设备内存特性
-======================
-
-由于一些限制，I/O 总线削弱了共享地址空间。大多数 I/O 总线只允许从设备到主内存的基本
-内存访问；甚至缓存一致性通常是可选的。从 CPU 访问设备内存甚至更加有限。通常情况下，它
-不是缓存一致的。
-
-如果我们只考虑 PCIE 总线，那么设备可以访问主内存（通常通过 IOMMU）并与 CPU 缓存一
-致。但是，它只允许设备对主存储器进行一组有限的原子操作。这在另一个方向上更糟：CPU
-只能访问有限范围的设备内存，而不能对其执行原子操作。因此，从内核的角度来看，设备内存不
-能被视为与常规内存等同。
-
-另一个严重的因素是带宽有限（约 32GBytes/s，PCIE 4.0 和 16 通道）。这比最快的 GPU
-内存 (1 TBytes/s) 慢 33 倍。最后一个限制是延迟。从设备访问主内存的延迟比设备访问自
-己的内存时高一个数量级。
-
-一些平台正在开发新的 I/O 总线或对 PCIE 的添加/修改以解决其中一些限制
-（OpenCAPI、CCIX）。它们主要允许 CPU 和设备之间的双向缓存一致性，并允许架构支持的所
-有原子操作。遗憾的是，并非所有平台都遵循这一趋势，并且一些主要架构没有针对这些问题的硬
-件解决方案。
-
-因此，为了使共享地址空间有意义，我们不仅必须允许设备访问任何内存，而且还必须允许任何内
-存在设备使用时迁移到设备内存（在迁移时阻止 CPU 访问）。
-
-
-共享地址空间和迁移
-==================
-
-HMM 打算提供两个主要功能。第一个是通过复制cpu页表到设备页表中来共享地址空间，因此对
-于进程地址空间中的任何有效主内存地址，相同的地址指向相同的物理内存。
-
-为了实现这一点，HMM 提供了一组帮助程序来填充设备页表，同时跟踪 CPU 页表更新。设备页表
-更新不像 CPU 页表更新那么容易。要更新设备页表，您必须分配一个缓冲区（或使用预先分配的
-缓冲区池）并在其中写入 GPU 特定命令以执行更新（取消映射、缓存失效和刷新等）。这不能通
-过所有设备的通用代码来完成。因此，为什么HMM提供了帮助器，在把硬件的具体细节留给设备驱
-动程序的同时，把一切可以考虑的因素都考虑进去了。
-
-HMM 提供的第二种机制是一种新的 ZONE_DEVICE 内存，它允许为设备内存的每个页面分配一个
-struct page。这些页面很特殊，因为 CPU 无法映射它们。然而，它们允许使用现有的迁移机
-制将主内存迁移到设备内存，从 CPU 的角度来看，一切看起来都像是换出到磁盘的页面。使用
-struct page可以与现有的 mm 机制进行最简单、最干净的集成。再次，HMM 仅提供帮助程序，
-首先为设备内存热插拔新的 ZONE_DEVICE 内存，然后执行迁移。迁移内容和时间的策略决定留
-给设备驱动程序。
-
-请注意，任何 CPU 对设备页面的访问都会触发缺页异常并迁移回主内存。例如，当支持给定CPU
-地址 A 的页面从主内存页面迁移到设备页面时，对地址 A 的任何 CPU 访问都会触发缺页异常
-并启动向主内存的迁移。
-
-凭借这两个特性，HMM 不仅允许设备镜像进程地址空间并保持 CPU 和设备页表同步，而且还通
-过迁移设备正在使用的数据集部分来利用设备内存。
-
-
-地址空间镜像实现和API
-=====================
-
-地址空间镜像的主要目标是允许将一定范围的 CPU 页表复制到一个设备页表中；HMM 有助于
-保持两者同步。想要镜像进程地址空间的设备驱动程序必须从注册 mmu_interval_notifier
-开始::
-
- int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
-				  struct mm_struct *mm, unsigned long start,
-				  unsigned long length,
-				  const struct mmu_interval_notifier_ops *ops);
-
-在 ops->invalidate() 回调期间，设备驱动程序必须对范围执行更新操作（将范围标记为只
-读，或完全取消映射等）。设备必须在驱动程序回调返回之前完成更新。
-
-当设备驱动程序想要填充一个虚拟地址范围时，它可以使用::
-
-  int hmm_range_fault(struct hmm_range *range);
-
-如果请求写访问，它将在丢失或只读条目上触发缺页异常（见下文）。缺页异常使用通用的 mm 缺
-页异常代码路径，就像 CPU 缺页异常一样。
-
-这两个函数都将 CPU 页表条目复制到它们的 pfns 数组参数中。该数组中的每个条目对应于虚拟
-范围中的一个地址。HMM 提供了一组标志来帮助驱动程序识别特殊的 CPU 页表项。
-
-在 sync_cpu_device_pagetables() 回调中锁定是驱动程序必须尊重的最重要的方面，以保
-持事物正确同步。使用模式是::
-
- int driver_populate_range(...)
- {
-      struct hmm_range range;
-      ...
-
-      range.notifier = &interval_sub;
-      range.start = ...;
-      range.end = ...;
-      range.hmm_pfns = ...;
-
-      if (!mmget_not_zero(interval_sub->notifier.mm))
-          return -EFAULT;
-
- again:
-      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
-      mmap_read_lock(mm);
-      ret = hmm_range_fault(&range);
-      if (ret) {
-          mmap_read_unlock(mm);
-          if (ret == -EBUSY)
-                 goto again;
-          return ret;
-      }
-      mmap_read_unlock(mm);
-
-      take_lock(driver->update);
-      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
-          release_lock(driver->update);
-          goto again;
-      }
-
-      /* Use pfns array content to update device page table,
-       * under the update lock */
-
-      release_lock(driver->update);
-      return 0;
- }
-
-driver->update 锁与驱动程序在其 invalidate() 回调中使用的锁相同。该锁必须在调用
-mmu_interval_read_retry() 之前保持，以避免与并发 CPU 页表更新发生任何竞争。
-
-利用 default_flags 和 pfn_flags_mask
-====================================
-
-hmm_range 结构有 2 个字段，default_flags 和 pfn_flags_mask，它们指定整个范围
-的故障或快照策略，而不必为 pfns 数组中的每个条目设置它们。
-
-例如，如果设备驱动程序需要至少具有读取权限的范围的页面，它会设置::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = 0;
-
-并如上所述调用 hmm_range_fault()。这将填充至少具有读取权限的范围内的所有页面。
-
-现在假设驱动程序想要做同样的事情，除了它想要拥有写权限的范围内的一页。现在驱动程序设
-置::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
-    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
-
-有了这个，HMM 将在至少读取（即有效）的所有页面中异常，并且对于地址
-== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入权限，即，如果
-CPU pte 没有设置写权限，那么HMM将调用handle_mm_fault()。
-
-hmm_range_fault 完成后，标志位被设置为页表的当前状态，即 HMM_PFN_VALID | 如果页
-面可写，将设置 HMM_PFN_WRITE。
-
-
-从核心内核的角度表示和管理设备内存
-==================================
-
-尝试了几种不同的设计来支持设备内存。第一个使用特定于设备的数据结构来保存有关迁移内存
-的信息，HMM 将自身挂接到 mm 代码的各个位置，以处理对设备内存支持的地址的任何访问。
-事实证明，这最终复制了 struct page 的大部分字段，并且还需要更新许多内核代码路径才
-能理解这种新的内存类型。
-
-大多数内核代码路径从不尝试访问页面后面的内存，而只关心struct page的内容。正因为如此，
-HMM 切换到直接使用 struct page 用于设备内存，这使得大多数内核代码路径不知道差异。
-我们只需要确保没有人试图从 CPU 端映射这些页面。
-
-移入和移出设备内存
-==================
-
-由于 CPU 无法直接访问设备内存，因此设备驱动程序必须使用硬件 DMA 或设备特定的加载/存
-储指令来迁移数据。migrate_vma_setup()、migrate_vma_pages() 和
-migrate_vma_finalize() 函数旨在使驱动程序更易于编写并集中跨驱动程序的通用代码。
-
-在将页面迁移到设备私有内存之前，需要创建特殊的设备私有 ``struct page`` 。这些将用
-作特殊的“交换”页表条目，以便 CPU 进程在尝试访问已迁移到设备专用内存的页面时会发生异常。
-
-这些可以通过以下方式分配和释放::
-
-    struct resource *res;
-    struct dev_pagemap pagemap;
-
-    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
-                                  "name of driver resource");
-    pagemap.type = MEMORY_DEVICE_PRIVATE;
-    pagemap.range.start = res->start;
-    pagemap.range.end = res->end;
-    pagemap.nr_range = 1;
-    pagemap.ops = &device_devmem_ops;
-    memremap_pages(&pagemap, numa_node_id());
-
-    memunmap_pages(&pagemap);
-    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
-
-还有devm_request_free_mem_region(), devm_memremap_pages(),
-devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``struct device``.
-
-整体迁移步骤类似于在系统内存中迁移 NUMA 页面(see :ref:`Page migration <page_migration>`) ，
-但这些步骤分为设备驱动程序特定代码和共享公共代码:
-
-1. ``mmap_read_lock()``
-
-   设备驱动程序必须将 ``struct vm_area_struct`` 传递给migrate_vma_setup()，
-   因此需要在迁移期间保留 mmap_read_lock() 或 mmap_write_lock()。
-
-2. ``migrate_vma_setup(struct migrate_vma *args)``
-
-   设备驱动初始化了 ``struct migrate_vma`` 的字段，并将该指针传递给
-   migrate_vma_setup()。``args->flags`` 字段是用来过滤哪些源页面应该被迁移。
-   例如，设置 ``MIGRATE_VMA_SELECT_SYSTEM`` 将只迁移系统内存，设置
-   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` 将只迁移驻留在设备私有内存中的页
-   面。如果后者被设置， ``args->pgmap_owner`` 字段被用来识别驱动所拥有的设备
-   私有页。这就避免了试图迁移驻留在其他设备中的设备私有页。目前，只有匿名的私有VMA
-   范围可以被迁移到系统内存和设备私有内存。
-
-   migrate_vma_setup()所做的第一步是用 ``mmu_notifier_invalidate_range_start()``
-   和 ``mmu_notifier_invalidate_range_end()`` 调用来遍历设备周围的页表，使
-   其他设备的MMU无效，以便在 ``args->src`` 数组中填写要迁移的PFN。
-   ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` ，
-   其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE， ``owner`` 字段设置为传递给
-   migrate_vma_setup()的 ``args->pgmap_owner`` 字段。这允许设备驱动跳过无
-   效化回调，只无效化那些实际正在迁移的设备私有MMU映射。这一点将在下一节详细解释。
-
-
-   在遍历页表时，一个 ``pte_none()`` 或 ``is_zero_pfn()`` 条目导致一个有效
-   的  “zero” PFN 存储在 ``args->src`` 阵列中。这让驱动分配设备私有内存并清
-   除它，而不是复制一个零页。到系统内存或设备私有结构页的有效PTE条目将被
-   ``lock_page()``锁定，与LRU隔离（如果系统内存和设备私有页不在LRU上），从进
-   程中取消映射，并插入一个特殊的迁移PTE来代替原来的PTE。 migrate_vma_setup()
-   还清除了 ``args->dst`` 数组。
-
-3. 设备驱动程序分配目标页面并将源页面复制到目标页面。
-
-   驱动程序检查每个 ``src`` 条目以查看该 ``MIGRATE_PFN_MIGRATE`` 位是否已
-   设置并跳过未迁移的条目。设备驱动程序还可以通过不填充页面的 ``dst`` 数组来选
-   择跳过页面迁移。
-
-   然后，驱动程序分配一个设备私有 struct page 或一个系统内存页，用 ``lock_page()``
-   锁定该页，并将 ``dst`` 数组条目填入::
-
-     dst[i] = migrate_pfn(page_to_pfn(dpage));
-
-   现在驱动程序知道这个页面正在被迁移，它可以使设备私有 MMU 映射无效并将设备私有
-   内存复制到系统内存或另一个设备私有页面。由于核心 Linux 内核会处理 CPU 页表失
-   效，因此设备驱动程序只需使其自己的 MMU 映射失效。
-
-   驱动程序可以使用 ``migrate_pfn_to_page(src[i])`` 来获取源设备的
-   ``struct page`` 面，并将源页面复制到目标设备上，如果指针为 ``NULL`` ，意
-   味着源页面没有被填充到系统内存中，则清除目标设备的私有内存。
-
-4. ``migrate_vma_pages()``
-
-   这一步是实际“提交”迁移的地方。
-
-   如果源页是 ``pte_none()`` 或 ``is_zero_pfn()`` 页，这时新分配的页会被插
-   入到CPU的页表中。如果一个CPU线程在同一页面上发生异常，这可能会失败。然而，页
-   表被锁定，只有一个新页会被插入。如果它失去了竞争，设备驱动将看到
-   ``MIGRATE_PFN_MIGRATE`` 位被清除。
-
-   如果源页被锁定、隔离等，源 ``struct page`` 信息现在被复制到目标
-   ``struct page`` ，最终完成CPU端的迁移。
-
-5. 设备驱动为仍在迁移的页面更新设备MMU页表，回滚未迁移的页面。
-
-   如果 ``src`` 条目仍然有 ``MIGRATE_PFN_MIGRATE`` 位被设置，设备驱动可以
-   更新设备MMU，如果 ``MIGRATE_PFN_WRITE`` 位被设置，则设置写启用位。
-
-6. ``migrate_vma_finalize()``
-
-   这一步用新页的页表项替换特殊的迁移页表项，并释放对源和目的 ``struct page``
-   的引用。
-
-7. ``mmap_read_unlock()``
-
-   现在可以释放锁了。
-
-独占访问存储器
-==============
-
-一些设备具有诸如原子PTE位的功能，可以用来实现对系统内存的原子访问。为了支持对一
-个共享的虚拟内存页的原子操作，这样的设备需要对该页的访问是排他的，而不是来自CPU
-的任何用户空间访问。  ``make_device_exclusive_range()`` 函数可以用来使一
-个内存范围不能从用户空间访问。
-
-这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
-导致一个异常，该异常会通过用原始映射替换该条目而得到恢复。驱动程序会被通知映射已
-经被MMU通知器改变，之后它将不再有对该页的独占访问。独占访问被保证持续到驱动程序
-放弃页面锁和页面引用为止，这时页面上的任何CPU异常都可以按所述进行。
-
-内存 cgroup (memcg) 和 rss 统计
-===============================
-
-目前，设备内存被视为 rss 计数器中的任何常规页面（如果设备页面用于匿名，则为匿名，
-如果设备页面用于文件支持页面，则为文件，如果设备页面用于共享内存，则为 shmem）。
-这是为了保持现有应用程序的故意选择，这些应用程序可能在不知情的情况下开始使用设备
-内存，运行不受影响。
-
-一个缺点是 OOM 杀手可能会杀死使用大量设备内存而不是大量常规系统内存的应用程序，
-因此不会释放太多系统内存。在决定以不同方式计算设备内存之前，我们希望收集更多关
-于应用程序和系统在存在设备内存的情况下在内存压力下如何反应的实际经验。
-
-对内存 cgroup 做出了相同的决定。设备内存页面根据相同的内存 cgroup 计算，常规
-页面将被计算在内。这确实简化了进出设备内存的迁移。这也意味着从设备内存迁移回常规
-内存不会失败，因为它会超过内存 cgroup 限制。一旦我们对设备内存的使用方式及其对
-内存资源控制的影响有了更多的了解，我们可能会在后面重新考虑这个选择。
-
-请注意，设备内存永远不能由设备驱动程序或通过 GUP 固定，因此此类内存在进程退出时
-总是被释放的。或者在共享内存或文件支持内存的情况下，当删除最后一个引用时。
diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
deleted file mode 100644
index c6d471ce2131..000000000000
--- a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
+++ /dev/null
@@ -1,436 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/hugetlbfs_reserv.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-==============
-Hugetlbfs 预留
-==============
-
-概述
-====
-
-:ref:`hugetlbpage` 中描述的巨页通常是预先分配给应用程序使用的。如果VMA指
-示要使用巨页，这些巨页会在缺页异常时被实例化到任务的地址空间。如果在缺页异常
-时没有巨页存在，任务就会被发送一个SIGBUS，并经常不高兴地死去。在加入巨页支
-持后不久，人们决定，在mmap()时检测巨页的短缺情况会更好。这个想法是，如果
-没有足够的巨页来覆盖映射，mmap()将失败。这首先是在mmap()时在代码中做一个
-简单的检查，以确定是否有足够的空闲巨页来覆盖映射。就像内核中的大多数东西一
-样，代码随着时间的推移而不断发展。然而，基本的想法是在mmap()时 “预留”
-巨页，以确保巨页可以用于该映射中的缺页异常。下面的描述试图描述在v4.10内核
-中是如何进行巨页预留处理的。
-
-
-读者
-====
-这个描述主要是针对正在修改hugetlbfs代码的内核开发者。
-
-
-数据结构
-========
-
-resv_huge_pages
-	这是一个全局的（per-hstate）预留的巨页的计数。预留的巨页只对预留它们的任
-	务可用。因此，一般可用的巨页的数量被计算为(``free_huge_pages - resv_huge_pages``)。
-Reserve Map
-	预留映射由以下结构体描述::
-
-		struct resv_map {
-			struct kref refs;
-			spinlock_t lock;
-			struct list_head regions;
-			long adds_in_progress;
-			struct list_head region_cache;
-			long region_cache_count;
-		};
-
-	系统中每个巨页映射都有一个预留映射。resv_map中的regions列表描述了映射中的
-	区域。一个区域被描述为::
-
-		struct file_region {
-			struct list_head link;
-			long from;
-			long to;
-		};
-
-	file_region结构体的 ‘from’ 和 ‘to’ 字段是进入映射的巨页索引。根据映射的类型，在
-	reserv_map 中的一个区域可能表示该范围存在预留，或预留不存在。
-Flags for MAP_PRIVATE Reservations
-	这些被存储在预留的映射指针的底部。
-
-	``#define HPAGE_RESV_OWNER    (1UL << 0)``
-		表示该任务是与该映射相关的预留的所有者。
-	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
-		表示最初映射此范围（并创建储备）的任务由于COW失败而从该任务（子任务）中取消映
-		射了一个页面。
-Page Flags
-	PagePrivate页面标志是用来指示在释放巨页时必须恢复巨页的预留。更多细节将在
-	“释放巨页” 一节中讨论。
-
-
-预留映射位置（私有或共享）
-==========================
-
-一个巨页映射或段要么是私有的，要么是共享的。如果是私有的，它通常只对一个地址空间
-（任务）可用。如果是共享的，它可以被映射到多个地址空间（任务）。对于这两种类型的映射，
-预留映射的位置和语义是明显不同的。位置的差异是：
-
-- 对于私有映射，预留映射挂在VMA结构体上。具体来说，就是vma->vm_private_data。这个保
-  留映射是在创建映射（mmap(MAP_PRIVATE)）时创建的。
-- 对于共享映射，预留映射挂在inode上。具体来说，就是inode->i_mapping->private_data。
-  由于共享映射总是由hugetlbfs文件系统中的文件支持，hugetlbfs代码确保每个节点包含一个预
-  留映射。因此，预留映射在创建节点时被分配。
-
-
-创建预留
-========
-当创建一个巨大的有页面支持的共享内存段（shmget(SHM_HUGETLB)）或通过mmap(MAP_HUGETLB)
-创建一个映射时，就会创建预留。这些操作会导致对函数hugetlb_reserve_pages()的调用::
-
-	int hugetlb_reserve_pages(struct inode *inode,
-				  long from, long to,
-				  struct vm_area_struct *vma,
-				  vm_flags_t vm_flags)
-
-hugetlb_reserve_pages()做的第一件事是检查在调用shmget()或mmap()时是否指定了NORESERVE
-标志。如果指定了NORESERVE，那么这个函数立即返回，因为不需要预留。
-
-参数'from'和'to'是映射或基础文件的巨页索引。对于shmget()，'from'总是0，'to'对应于段/映射
-的长度。对于mmap()，offset参数可以用来指定进入底层文件的偏移量。在这种情况下，'from'和'to'
-参数已经被这个偏移量所调整。
-
-PRIVATE和SHARED映射之间的一个很大的区别是预留在预留映射中的表示方式。
-
-- 对于共享映射，预留映射中的条目表示对应页面的预留存在或曾经存在。当预留被消耗时，预留映射不被
-  修改。
-- 对于私有映射，预留映射中没有条目表示相应页面存在预留。随着预留被消耗，条目被添加到预留映射中。
-  因此，预留映射也可用于确定哪些预留已被消耗。
-
-对于私有映射，hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外，
-HPAGE_RESV_OWNER标志被设置，以表明该VMA拥有预留。
-
-预留映射被查阅以确定当前映射/段需要多少巨页预留。对于私有映射，这始终是一个值（to - from）。
-然而，对于共享映射来说，一些预留可能已经存在于(to - from)的范围内。关于如何实现这一点的细节，
-请参见 :ref:`预留映射的修改 <resv_map_modifications>` 一节。
-
-该映射可能与一个子池（subpool）相关联。如果是这样，将查询子池以确保有足够的空间用于映射。子池
-有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref: `子池预留 <sub_pool_resv>`
-一节。
-
-在咨询了预留映射和子池之后，就知道了需要的新预留数量。hugetlb_acct_memory()函数被调用以检查
-并获取所要求的预留数量。hugetlb_acct_memory()调用到可能分配和调整剩余页数的函数。然而，在这
-些函数中，代码只是检查以确保有足够的空闲的巨页来容纳预留。如果有的话，全局预留计数resv_huge_pages
-会被调整，如下所示::
-
-	if (resv_needed <= (resv_huge_pages - free_huge_pages))
-		resv_huge_pages += resv_needed;
-
-注意，在检查和调整这些计数器时，全局锁hugetlb_lock会被预留。
-
-如果有足够的空闲的巨页，并且全局计数resv_huge_pages被调整，那么与映射相关的预留映射被修改以
-反映预留。在共享映射的情况下，将存在一个file_region，包括'from'-'to'范围。对于私有映射，
-不对预留映射进行修改，因为没有条目表示存在预留。
-
-如果hugetlb_reserve_pages()成功，全局预留数和与映射相关的预留映射将根据需要被修改，以确保
-在'from'-'to'范围内存在预留。
-
-消耗预留/分配一个巨页
-===========================
-
-当与预留相关的巨页在相应的映射中被分配和实例化时，预留就被消耗了。该分配是在函数alloc_huge_page()
-中进行的::
-
-	struct page *alloc_huge_page(struct vm_area_struct *vma,
-				     unsigned long addr, int avoid_reserve)
-
-alloc_huge_page被传递给一个VMA指针和一个虚拟地址，因此它可以查阅预留映射以确定是否存在预留。
-此外，alloc_huge_page需要一个参数avoid_reserve，该参数表示即使看起来已经为指定的地址预留了
-预留，也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下，即现有页面的额
-外拷贝被分配。
-
-
-调用辅助函数vma_needs_reservation()来确定是否存在对映射(vma)中地址的预留。关于这个函数的详
-细内容，请参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。从
-vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留，则为0，如果不存在预留，则为1。
-如果不存在预留，并且有一个与映射相关联的子池，则查询子池以确定它是否包含预留。如果子池包含预留，
-则可将其中一个用于该分配。然而，在任何情况下，avoid_reserve参数都会优先考虑为分配使用预留。在
-确定预留是否存在并可用于分配后，调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关
-的参数：
-
-- avoid_reserve，这是传递给alloc_huge_page()的同一个值/参数。
-- chg，尽管这个参数的类型是long，但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0，
-  则表明存在预留（关于可能的问题，请参见 “预留和内存策略” 一节）。如果值
-  为1，则表示不存在预留，如果可能的话，必须从全局空闲池中取出该页。
-
-与VMA的内存策略相关的空闲列表被搜索到一个空闲页。如果找到了一个页面，当该页面从空闲列表中移除时，
-free_huge_pages的值被递减。如果有一个与该页相关的预留，将进行以下调整::
-
-	SetPagePrivate(page);	/* 表示分配这个页面消耗了一个预留，
-				 * 如果遇到错误，以至于必须释放这个页面，预留将被
-				 * 恢复。 */
-	resv_huge_pages--;	/* 减少全局预留计数 */
-
-注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
-的剩余巨页和超额分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整:
-SetPagePrivate(page) 和 resv_huge_pages--.
-
-在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
-面被释放时，这将被用于子池的计数。
-
-然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
-到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
-已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建一
-个新的条目。
-
-注意，如果找不到满足VMA内存策略的巨页，将尝试使用伙伴分配器分配一个。这就带来了超出预留范围
-的剩余巨页和过度分配的问题。即使分配了一个多余的页面，也会进行与上面一样的基于预留的调整。
-SetPagePrivate(page)和resv_huge_pages-。
-
-在获得一个新的巨页后，(page)->private被设置为与该页面相关的子池的值，如果它存在的话。当页
-面被释放时，这将被用于子池的计数。
-
-然后调用函数vma_commit_reservation()，根据预留的消耗情况调整预留映射。一般来说，这涉及
-到确保页面在区域映射的file_region结构体中被表示。对于预留存在的共享映射，预留映射中的条目
-已经存在，所以不做任何改变。然而，如果共享映射中没有预留，或者这是一个私有映射，则必须创建
-一个新的条目。
-
-在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用
-vma_commit_reservation()之间，预留映射有可能被改变。如果hugetlb_reserve_pages在共
-享映射中为同一页面被调用，这将是可能的。在这种情况下，预留计数和子池空闲页计数会有一个偏差。
-这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来
-识别。如果检测到这种竞争，子池和全局预留计数将被调整以进行补偿。关于这些函数的更多信息，请
-参见 :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。
-
-
-实例化巨页
-==========
-
-在巨页分配之后，页面通常被添加到分配任务的页表中。在此之前，共享映射中的页面被添加到页面缓
-存中，私有映射中的页面被添加到匿名反向映射中。在这两种情况下，PagePrivate标志被清除。因此，
-当一个已经实例化的巨页被释放时，不会对全局预留计数（resv_huge_pages）进行调整。
-
-
-释放巨页
-========
-
-巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此，它只传
-递一个指向页面结构体的指针。当一个巨页被释放时，可能需要进行预留计算。如果该页与包含保
-留的子池相关联，或者该页在错误路径上被释放，必须恢复全局预留计数，就会出现这种情况。
-
-page->private字段指向与该页相关的任何子池。如果PagePrivate标志被设置，它表明全局预留计数
-应该被调整（关于如何设置这些标志的信息，请参见
-:ref: `消耗预留/分配一个巨页 <consume_resv>` ）。
-
-
-该函数首先调用hugepage_subpool_put_pages()来处理该页。如果这个函数返回一个0的值（不等于
-传递的1的值），它表明预留与子池相关联，这个新释放的页面必须被用来保持子池预留的数量超过最小值。
-因此，在这种情况下，全局resv_huge_pages计数器被递增。
-
-如果页面中设置了PagePrivate标志，那么全局resv_huge_pages计数器将永远被递增。
-
-子池预留
-========
-
-有一个结构体hstate与每个巨页尺寸相关联。hstate跟踪所有指定大小的巨页。一个子池代表一
-个hstate中的页面子集，它与一个已挂载的hugetlbfs文件系统相关
-
-当一个hugetlbfs文件系统被挂载时，可以指定min_size选项，它表示文件系统所需的最小的巨页数量。
-如果指定了这个选项，与min_size相对应的巨页的数量将被预留给文件系统使用。这个数字在结构体
-hugepage_subpool的min_hpages字段中被跟踪。在挂载时，hugetlb_acct_memory(min_hpages)
-被调用以预留指定数量的巨页。如果它们不能被预留，挂载就会失败。
-
-当从子池中获取或释放页面时，会调用hugepage_subpool_get/put_pages()函数。
-hugepage_subpool_get/put_pages被传递给巨页数量，以此来调整子池的 “已用页面” 计数
-（get为下降，put为上升）。通常情况下，如果子池中没有足够的页面，它们会返回与传递的相同的值或
-一个错误。
-
-然而，如果预留与子池相关联，可能会返回一个小于传递值的返回值。这个返回值表示必须进行的额外全局
-池调整的数量。例如，假设一个子池包含3个预留的巨页，有人要求5个。与子池相关的3个预留页可以用来
-满足部分请求。但是，必须从全局池中获得2个页面。为了向调用者转达这一信息，将返回值2。然后，调用
-者要负责从全局池中获取另外两个页面。
-
-
-COW和预留
-==========
-
-由于共享映射都指向并使用相同的底层页面，COW最大的预留问题是私有映射。在这种情况下，两个任务可
-以指向同一个先前分配的页面。一个任务试图写到该页，所以必须分配一个新的页，以便每个任务都指向它
-自己的页。
-
-当该页最初被分配时，该页的预留被消耗了。当由于COW而试图分配一个新的页面时，有可能没有空闲的巨
-页，分配会失败。
-
-当最初创建私有映射时，通过设置所有者的预留映射指针中的HPAGE_RESV_OWNER位来标记映射的所有者。
-由于所有者创建了映射，所有者拥有与映射相关的所有预留。因此，当一个写异常发生并且没有可用的页面
-时，对预留的所有者和非所有者采取不同的行动。
-
-在发生异常的任务不是所有者的情况下，异常将失败，该任务通常会收到一个SIGBUS。
-
-如果所有者是发生异常的任务，我们希望它能够成功，因为它拥有原始的预留。为了达到这个目的，该页被
-从非所有者任务中解映射出来。这样一来，唯一的引用就是来自拥有者的任务。此外，HPAGE_RESV_UNMAPPED
-位被设置在非拥有任务的预留映射指针中。如果非拥有者任务后来在一个不存在的页面上发生异常，它可能
-会收到一个SIGBUS。但是，映射/预留的原始拥有者的行为将与预期一致。
-
-预留映射的修改
-==============
-
-以下低级函数用于对预留映射进行修改。通常情况下，这些函数不会被直接调用。而是调用一个预留映射辅
-助函数，该函数调用这些低级函数中的一个。这些低级函数在源代码（mm/hugetlb.c）中得到了相当好的
-记录。这些函数是::
-
-	long region_chg(struct resv_map *resv, long f, long t);
-	long region_add(struct resv_map *resv, long f, long t);
-	void region_abort(struct resv_map *resv, long f, long t);
-	long region_count(struct resv_map *resv, long f, long t);
-
-在预留映射上的操作通常涉及两个操作:
-
-1) region_chg()被调用来检查预留映射，并确定在指定的范围[f, t]内有多少页目前没有被代表。
-
-   调用代码执行全局检查和分配，以确定是否有足够的巨页使操作成功。
-
-2)
-  a) 如果操作能够成功，regi_add()将被调用，以实际修改先前传递给regi_chg()的相同范围
-     [f, t]的预留映射。
-  b) 如果操作不能成功，region_abort被调用，在相同的范围[f, t]内中止操作。
-
-注意，这是一个两步的过程， region_add()和 region_abort()在事先调用 region_chg()后保证
-成功。 region_chg()负责预先分配任何必要的数据结构以确保后续操作（特别是 region_add()）的
-成功。
-
-如上所述，region_chg()确定该范围内当前没有在映射中表示的页面的数量。region_add()返回添加
-到映射中的范围内的页数。在大多数情况下， region_add() 的返回值与 region_chg() 的返回值相
-同。然而，在共享映射的情况下，有可能在调用 region_chg() 和 region_add() 之间对预留映射进
-行更改。在这种情况下，regi_add()的返回值将与regi_chg()的返回值不符。在这种情况下，全局计数
-和子池计数很可能是不正确的，需要调整。检查这种情况并进行适当的调整是调用者的责任。
-
-函数region_del()被调用以从预留映射中移除区域。
-它通常在以下情况下被调用:
-
-- 当hugetlbfs文件系统中的一个文件被删除时，该节点将被释放，预留映射也被释放。在释放预留映射
-  之前，所有单独的file_region结构体必须被释放。在这种情况下，region_del的范围是[0, LONG_MAX]。
-- 当一个hugetlbfs文件正在被截断时。在这种情况下，所有在新文件大小之后分配的页面必须被释放。
-  此外，预留映射中任何超过新文件大小的file_region条目必须被删除。在这种情况下，region_del
-  的范围是[new_end_of_file, LONG_MAX]。
-- 当在一个hugetlbfs文件中打洞时。在这种情况下，巨页被一次次从文件的中间移除。当这些页被移除
-  时，region_del()被调用以从预留映射中移除相应的条目。在这种情况下，region_del被传递的范
-  围是[page_idx, page_idx + 1]。
-
-在任何情况下，region_del()都会返回从预留映射中删除的页面数量。在非常罕见的情况下，region_del()
-会失败。这只能发生在打洞的情况下，即它必须分割一个现有的file_region条目，而不能分配一个新的
-结构体。在这种错误情况下，region_del()将返回-ENOMEM。这里的问题是，预留映射将显示对该页有
-预留。然而，子池和全局预留计数将不反映该预留。为了处理这种情况，调用函数hugetlb_fix_reserve_counts()
-来调整计数器，使其与不能被删除的预留映射条目相对应。
-
-region_count()在解除私有巨页映射时被调用。在私有映射中，预留映射中没有条目表明存在一个预留。
-因此，通过计算预留映射中的条目数，我们知道有多少预留被消耗了，有多少预留是未完成的
-（Outstanding = (end - start) - region_count（resv, start, end））。由于映射正在消
-失，子池和全局预留计数被未完成的预留数量所减去。
-
-预留映射帮助函数
-================
-
-有几个辅助函数可以查询和修改预留映射。这些函数只对特定的巨页的预留感兴趣，所以它们只是传入一个
-地址而不是一个范围。此外，它们还传入相关的VMA。从VMA中，可以确定映射的类型（私有或共享）和预留
-映射的位置（inode或VMA）。这些函数只是调用 “预留映射的修改” 一节中描述的基础函数。然而，
-它们确实考虑到了私有和共享映射的预留映射条目的 “相反” 含义，并向调用者隐藏了这个细节::
-
-	long vma_needs_reservation(struct hstate *h,
-				   struct vm_area_struct *vma,
-				   unsigned long addr)
-
-该函数为指定的页面调用 region_chg()。如果不存在预留，则返回1。如果存在预留，则返回0::
-
-	long vma_commit_reservation(struct hstate *h,
-				    struct vm_area_struct *vma,
-				    unsigned long addr)
-
-这将调用 region_add()，用于指定的页面。与region_chg和region_add的情况一样，该函数应在
-先前调用的vma_needs_reservation后调用。它将为该页添加一个预留条目。如果预留被添加，它将
-返回1，如果没有则返回0。返回值应与之前调用vma_needs_reservation的返回值进行比较。如果出
-现意外的差异，说明在两次调用之间修改了预留映射::
-
-	void vma_end_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-这将调用指定页面的 region_abort()。与region_chg和region_abort的情况一样，该函数应在
-先前调用的vma_needs_reservation后被调用。它将中止/结束正在进行的预留添加操作::
-
-	long vma_add_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-这是一个特殊的包装函数，有助于在错误路径上清理预留。它只从repare_reserve_on_error()函数
-中调用。该函数与vma_needs_reservation一起使用，试图将一个预留添加到预留映射中。它考虑到
-了私有和共享映射的不同预留映射语义。因此，region_add被调用用于共享映射（因为映射中的条目表
-示预留），而region_del被调用用于私有映射（因为映射中没有条目表示预留）。关于在错误路径上需
-要做什么的更多信息，请参见  “错误路径中的预留清理”  。
-
-
-错误路径中的预留清理
-====================
-
-正如在:ref:`预留映射帮助函数<resv_map_helpers>` 一节中提到的，预留的修改分两步进行。首
-先，在分配页面之前调用vma_needs_reservation。如果分配成功，则调用vma_commit_reservation。
-如果不是，则调用vma_end_reservation。全局和子池的预留计数根据操作的成功或失败进行调整，
-一切都很好。
-
-此外，在一个巨页被实例化后，PagePrivate标志被清空，这样，当页面最终被释放时，计数是
-正确的。
-
-然而，有几种情况是，在一个巨页被分配后，但在它被实例化之前，就遇到了错误。在这种情况下，
-页面分配已经消耗了预留，并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放
-（在实例化和清除PagePrivate之前），那么free_huge_page将增加全局预留计数。然而，预留映射
-显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高，
-并阻止分配一个预先分配的页面。
-
-函数 restore_reserve_on_error() 试图处理这种情况。它有相当完善的文档。这个函数的目的
-是将预留映射恢复到页面分配前的状态。通过这种方式，预留映射的状态将与页面释放后的全局预留计
-数相对应。
-
-函数restore_reserve_on_error本身在试图恢复预留映射条目时可能会遇到错误。在这种情况下，
-它将简单地清除该页的PagePrivate标志。这样一来，当页面被释放时，全局预留计数将不会被递增。
-然而，预留映射将继续看起来像预留被消耗了一样。一个页面仍然可以被分配到该地址，但它不会像最
-初设想的那样使用一个预留页。
-
-有一些代码（最明显的是userfaultfd）不能调用restore_reserve_on_error。在这种情况下，
-它简单地修改了PagePrivate，以便在释放巨页时不会泄露预留。
-
-
-预留和内存策略
-==============
-当git第一次被用来管理Linux代码时，每个节点的巨页列表就存在于hstate结构中。预留的概念是
-在一段时间后加入的。当预留被添加时，没有尝试将内存策略考虑在内。虽然cpusets与内存策略不
-完全相同，但hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作
-用::
-
-
-	/*
-	 * 当cpuset被配置时，它打破了严格的hugetlb页面预留，因为计数是在一个全局变量上完
-	 * 成的。在有cpuset的情况下，这样的预留完全是垃圾，因为预留没有根据当前cpuset的
-	 * 页面可用性来检查。在任务所在的cpuset中缺乏空闲的htlb页面时，应用程序仍然有可能
-	 * 被内核OOM'ed。试图用cpuset来执行严格的计数几乎是不可能的（或者说太难看了），因
-	 * 为cpuset太不稳定了，任务或内存节点可以在cpuset之间动态移动。与cpuset共享
-	 * hugetlb映射的语义变化是不可取的。然而，为了预留一些语义，我们退回到检查当前空闲
-	 * 页的可用性，作为一种最好的尝试，希望能将cpuset改变语义的影响降到最低。
-	 */
-
-添加巨页预留是为了防止在缺页异常时出现意外的页面分配失败（OOM）。然而，如果一个应用
-程序使用cpusets或内存策略，就不能保证在所需的节点上有巨页可用。即使有足够数量的全局
-预留，也是如此。
-
-Hugetlbfs回归测试
-=================
-
-最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代码，请使用
-libhugetlbfs测试套件来检查回归情况。此外，如果你添加了任何新的hugetlb功能，请在
-libhugetlbfs中添加适当的测试。
-
---
-Mike Kravetz，2017年4月7日
diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/vm/hwpoison.rst
deleted file mode 100644
index c6e1e7bdb05b..000000000000
--- a/Documentation/translations/zh_CN/vm/hwpoison.rst
+++ /dev/null
@@ -1,166 +0,0 @@
-
-:Original: Documentation/vm/hwpoison.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-hwpoison
-========
-
-什么是hwpoison?
-===============
-
-
-即将推出的英特尔CPU支持从一些内存错误中恢复（ ``MCA恢复`` ）。这需要操作系统宣布
-一个页面"poisoned"，杀死与之相关的进程，并避免在未来使用它。
-
-这个补丁包在虚拟机中实现了必要的(编程)框架。
-
-引用概述中的评论::
-
-	高级机器的检查与处理。处理方法是损坏的页面被硬件报告，通常是由于2位ECC内
-	存或高速缓存故障。
-
-	这主要是针对在后台检测到的损坏的页面。当当前的CPU试图访问它时，当前运行的进程
-	可以直接被杀死。因为还没有访问损坏的页面, 如果错误由于某种原因不能被处理，就可
-	以安全地忽略它. 而不是用另外一个机器检查去处理它。
-
-	处理不同状态的页面缓存页。这里棘手的部分是，相对于其他虚拟内存用户， 我们可以异
-	步访问任何页面。因为内存故障可能随时随地发生，可能违反了他们的一些假设。这就是
-	为什么这段代码必须非常小心。一般来说，它试图使用正常的锁规则，如获得标准锁，即使
-	这意味着错误处理可能需要很长的时间。
-
-	这里的一些操作有点低效，并且具有非线性的算法复杂性，因为数据结构没有针对这种情
-	况进行优化。特别是从vma到进程的映射就是这种情况。由于这种情况大概率是罕见的，所
-	以我们希望我们可以摆脱这种情况。
-
-该代码由mm/memory-failure.c中的高级处理程序、一个新的页面poison位和虚拟机中的
-各种检查组成，用来处理poison的页面。
-
-现在主要目标是KVM客户机，但它适用于所有类型的应用程序。支持KVM需要最近的qemu-kvm
-版本。
-
-对于KVM的使用，需要一个新的信号类型，这样KVM就可以用适当的地址将机器检查注入到客户
-机中。这在理论上也允许其他应用程序处理内存故障。我们的期望是，所有的应用程序都不要这
-样做，但一些非常专业的应用程序可能会这样做。
-
-故障恢复模式
-============
-
-有两种（实际上是三种）模式的内存故障恢复可以在。
-
-vm.memory_failure_recovery sysctl 置零:
-	所有的内存故障都会导致panic。请不要尝试恢复。
-
-早期处理
-	(可以在全局和每个进程中控制) 一旦检测到错误，立即向应用程序发送SIGBUS这允许
-	应用程序以温和的方式处理内存错误（例如，放弃受影响的对象） 这是KVM qemu使用的
-	模式。
-
-推迟处理
-	当应用程序运行到损坏的页面时，发送SIGBUS。这对不知道内存错误的应用程序来说是
-	最好的，默认情况下注意一些页面总是被当作late kill处理。
-
-用户控制
-========
-
-vm.memory_failure_recovery
-	参阅 sysctl.txt
-
-vm.memory_failure_early_kill
-	全局启用early kill
-
-PR_MCE_KILL
-	设置early/late kill mode/revert 到系统默认值。
-
-	arg1: PR_MCE_KILL_CLEAR:
-		恢复到系统默认值
-	arg1: PR_MCE_KILL_SET:
-		arg2定义了线程特定模式
-
-		PR_MCE_KILL_EARLY:
-			Early kill
-		PR_MCE_KILL_LATE:
-			Late kill
-		PR_MCE_KILL_DEFAULT
-			使用系统全局默认值
-
-	注意，如果你想有一个专门的线程代表进程处理SIGBUS(BUS_MCEERR_AO)，你应该在
-	指定线程上调用prctl(PR_MCE_KILL_EARLY)。否则，SIGBUS将被发送到主线程。
-
-PR_MCE_KILL_GET
-	返回当前模式
-
-测试
-====
-
-* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页面
-
-* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模块
-
-  corrupt-pfn
-	在PFN处注入hwpoison故障，并echoed到这个文件。这做了一些早期过滤，以避
-	免在测试套件中损坏非预期页面。
-  unpoison-pfn
-	在PFN的Software-unpoison页面对应到这个文件。这样，一个页面可以再次被
-	复用。这只对Linux注入的故障起作用，对真正的内存故障不起作用。
-
-  注意这些注入接口并不稳定，可能会在不同的内核版本中发生变化
-
-  corrupt-filter-dev-major, corrupt-filter-dev-minor
-	只处理与块设备major/minor定义的文件系统相关的页面的内存故障。-1U是通
-	配符值。这应该只用于人工注入的测试。
-
-  corrupt-filter-memcg
-	限制注入到memgroup拥有的页面。由memcg的inode号指定。
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-	        usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	当指定时，只有在((page_flags & mask) == value)的情况下才会poison页面。
-	这允许对许多种类的页面进行压力测试。page_flags与/proc/kpageflags中的相
-	同。这些标志位在include/linux/kernel-page-flags.h中定义，并在
-	Documentation/admin-guide/mm/pagemap.rst中记录。
-
-* 架构特定的MCE注入器
-
-  x86 有 mce-inject, mce-test
-
-  在mce-test中的一些便携式hwpoison测试程序，见下文。
-
-引用
-====
-
-http://halobates.de/mce-lc09-2.pdf
-	09年LinuxCon的概述演讲
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
-	测试套件（在tsrc中的hwpoison特定可移植测试）。
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
-	x86特定的注入器
-
-
-限制
-====
-- 不是所有的页面类型都被支持，而且永远不会。大多数内核内部对象不能被恢
-  复，目前只有LRU页。
-
----
-Andi Kleen, 2009年10月
diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/vm/index.rst
deleted file mode 100644
index a1c6d529b6ff..000000000000
--- a/Documentation/translations/zh_CN/vm/index.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/index.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-=================
-Linux内存管理文档
-=================
-
-这是一个关于Linux内存管理（mm）子系统内部的文档集，其中有不同层次的细节，包括注释
-和邮件列表的回复，用于阐述数据结构和算法的基本情况。如果你正在寻找关于简单分配内存的建
-议，请参阅(Documentation/translations/zh_CN/core-api/memory-allocation.rst)。
-对于控制和调整指南，请参阅(Documentation/admin-guide/mm/index)。
-TODO:待引用文档集被翻译完毕后请及时修改此处）
-
-.. toctree::
-   :maxdepth: 1
-
-   active_mm
-   balance
-   damon/index
-   free_page_reporting
-   highmem
-   ksm
-   frontswap
-   hmm
-   hwpoison
-   hugetlbfs_reserv
-   memory-model
-   mmu_notifier
-   numa
-   overcommit-accounting
-   page_frags
-   page_owner
-   page_table_check
-   remap_file_pages
-   split_page_table_lock
-   z3fold
-   zsmalloc
-
-TODOLIST:
-* arch_pgtable_helpers
-* free_page_reporting
-* hugetlbfs_reserv
-* page_migration
-* slub
-* transhuge
-* unevictable-lru
-* vmalloced-kernel-stacks
diff --git a/Documentation/translations/zh_CN/vm/ksm.rst b/Documentation/translations/zh_CN/vm/ksm.rst
deleted file mode 100644
index 83b0c73984da..000000000000
--- a/Documentation/translations/zh_CN/vm/ksm.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-.. include:: ../disclaimer-zh_CN.rst
-
-:Original: Documentation/vm/ksm.rst
-
-:翻译:
-
-   徐鑫 xu xin <xu.xin16@zte.com.cn>
-
-============
-内核同页合并
-============
-
-KSM 是一种节省内存的数据去重功能，由CONFIG_KSM=y启用，并在2.6.32版本时被添加
-到Linux内核。详见 ``mm/ksm.c`` 的实现，以及http://lwn.net/Articles/306704和
-https://lwn.net/Articles/330589
-
-KSM的用户空间的接口在Documentation/translations/zh_CN/admin-guide/mm/ksm.rst
-文档中有描述。
-
-设计
-====
-
-概述
-----
-
-概述内容请见mm/ksm.c文档中的“DOC: Overview”
-
-逆映射
-------
-KSM维护着稳定树中的KSM页的逆映射信息。
-
-当KSM页面的共享数小于 ``max_page_sharing`` 的虚拟内存区域(VMAs)时，则代表了
-KSM页的稳定树其中的节点指向了一个rmap_item结构体类型的列表。同时，这个KSM页
-的 ``page->mapping`` 指向了该稳定树节点。
-
-如果共享数超过了阈值，KSM将给稳定树添加第二个维度。稳定树就变成链接一个或多
-个稳定树"副本"的"链"。每个副本都保留KSM页的逆映射信息，其中 ``page->mapping``
-指向该"副本"。
-
-每个链以及链接到该链中的所有"副本"强制不变的是，它们代表了相同的写保护内存
-内容，尽管任中一个"副本"是由同一片内存区的不同的KSM复制页所指向的。
-
-这样一来，相比与无限的逆映射链表，稳定树的查找计算复杂性不受影响。但在稳定树
-本身中不能有重复的KSM页面内容仍然是强制要求。
-
-由 ``max_page_sharing`` 强制决定的数据去重限制是必要的，以此来避免虚拟内存
-rmap链表变得过大。rmap的遍历具有O(N)的复杂度，其中N是共享页面的rmap_项（即
-虚拟映射）的数量，而这个共享页面的节点数量又被 ``max_page_sharing`` 所限制。
-因此，这有效地将线性O(N)计算复杂度从rmap遍历中分散到不同的KSM页面上。ksmd进
-程在稳定节点"链"上的遍历也是O(N)，但这个N是稳定树"副本"的数量，而不是rmap项
-的数量，因此它对ksmd性能没有显著影响。实际上，最佳稳定树"副本"的候选节点将
-保留在"副本"列表的开头。
-
-``max_page_sharing`` 的值设置得高了会促使更快的内存合并（因为将有更少的稳定
-树副本排队进入稳定节点chain->hlist）和更高的数据去重系数，但代价是在交换、压
-缩、NUMA平衡和页面迁移过程中可能导致KSM页的最大rmap遍历速度较慢。
-
-``stable_node_dups/stable_node_chains`` 的比值还受 ``max_page_sharing`` 调控
-的影响，高比值可能意味着稳定节点dup中存在碎片，这可以通过在ksmd中引入碎片算
-法来解决，该算法将rmap项从一个稳定节点dup重定位到另一个稳定节点dup，以便释放
-那些仅包含极少rmap项的稳定节点"dup"，但这可能会增加ksmd进程的CPU使用率，并可
-能会减慢应用程序在KSM页面上的只读计算。
-
-KSM会定期扫描稳定节点"链"中链接的所有稳定树"副本"，以便删减过时了的稳定节点。
-这种扫描的频率由 ``stable_node_chains_prune_millisecs`` 这个sysfs 接口定义。
-
-参考
-====
-内核代码请见mm/ksm.c。
-涉及的函数(mm_slot  ksm_scan  stable_node  rmap_item)。
diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/vm/memory-model.rst
deleted file mode 100644
index 013e30c88d72..000000000000
--- a/Documentation/translations/zh_CN/vm/memory-model.rst
+++ /dev/null
@@ -1,135 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/memory-model.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-============
-物理内存模型
-============
-
-系统中的物理内存可以用不同的方式进行寻址。最简单的情况是，物理内存从地址0开
-始，跨越一个连续的范围，直到最大的地址。然而，这个范围可能包含CPU无法访问的
-小孔隙。那么，在完全不同的地址可能有几个连续的范围。而且，别忘了NUMA，即不
-同的内存库连接到不同的CPU。
-
-Linux使用两种内存模型中的一种对这种多样性进行抽象。FLATMEM和SPARSEM。每
-个架构都定义了它所支持的内存模型，默认的内存模型是什么，以及是否有可能手动
-覆盖该默认值。
-
-所有的内存模型都使用排列在一个或多个数组中的 `struct page` 来跟踪物理页
-帧的状态。
-
-无论选择哪种内存模型，物理页框号（PFN）和相应的 `struct page` 之间都存
-在一对一的映射关系。
-
-每个内存模型都定义了 :c:func:`pfn_to_page` 和 :c:func:`page_to_pfn`
-帮助函数，允许从PFN到 `struct page` 的转换，反之亦然。
-
-FLATMEM
-=======
-
-最简单的内存模型是FLATMEM。这个模型适用于非NUMA系统的连续或大部分连续的
-物理内存。
-
-在FLATMEM内存模型中，有一个全局的 `mem_map` 数组来映射整个物理内存。对
-于大多数架构，孔隙在 `mem_map` 数组中都有条目。与孔洞相对应的 `struct page`
-对象从未被完全初始化。
-
-为了分配 `mem_map` 数组，架构特定的设置代码应该调用free_area_init()函数。
-然而，在调用memblock_free_all()函数之前，映射数组是不能使用的，该函数
-将所有的内存交给页分配器。
-
-一个架构可能会释放 `mem_map` 数组中不包括实际物理页的部分。在这种情况下，特
-定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。
-
-使用FLATMEM，PFN和 `struct page` 之间的转换是直接的。 `PFN - ARCH_PFN_OFFSET`
-是 `mem_map` 数组的一个索引。
-
-`ARCH_PFN_OFFSET` 定义了物理内存起始地址不同于0的系统的第一个页框号。
-
-SPARSEMEM
-=========
-
-SPARSEMEM是Linux中最通用的内存模型，它是唯一支持若干高级功能的内存模型，
-如物理内存的热插拔、非易失性内存设备的替代内存图和较大系统的内存图的延迟
-初始化。
-
-SPARSEMEM模型将物理内存显示为一个部分的集合。一个区段用mem_section结构
-体表示，它包含 `section_mem_map` ，从逻辑上讲，它是一个指向 `struct page`
-阵列的指针。然而，它被存储在一些其他的magic中，以帮助分区管理。区段的大小
-和最大区段数是使用 `SECTION_SIZE_BITS` 和 `MAX_PHYSMEM_BITS` 常量
-来指定的，这两个常量是由每个支持SPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS`
-是一个架构所支持的物理地址的实际宽度，而 `SECTION_SIZE_BITS` 是一个任
-意的值。
-
-最大的段数表示为 `NR_MEM_SECTIONS` ，定义为
-
-.. math::
-
-   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
-
-`mem_section` 对象被安排在一个叫做 `mem_sections` 的二维数组中。这个数组的
-大小和位置取决于 `CONFIG_SPARSEM_EXTREME` 和可能的最大段数:
-
-* 当 `CONFIG_SPARSEMEM_EXTREME` 被禁用时， `mem_sections` 数组是静态的，有
-  `NR_MEM_SECTIONS` 行。每一行持有一个 `mem_section` 对象。
-* 当 `CONFIG_SPARSEMEM_EXTREME` 被启用时， `mem_sections` 数组被动态分配。
-  每一行包含价值 `PAGE_SIZE` 的 `mem_section` 对象，行数的计算是为了适应所有的
-  内存区。
-
-架构设置代码应该调用sparse_init()来初始化内存区和内存映射。
-
-通过SPARSEMEM，有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和
- "sparse vmemmap"。选择是在构建时进行的，它由 `CONFIG_SPARSEMEM_VMEMMAP` 的
- 值决定。
-
-Classic sparse在page->flags中编码了一个页面的段号，并使用PFN的高位来访问映射该页
-框的段。在一个区段内，PFN是指向页数组的索引。
-
-Sparse vmemmapvmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操
-作。有一个全局的 `struct page *vmemmap` 指针，指向一个虚拟连续的 `struct page`
-对象阵列。PFN是该数组的一个索引，`struct page` 从 `vmemmap` 的偏移量是该页的PFN。
-
-为了使用vmemmap，一个架构必须保留一个虚拟地址的范围，以映射包含内存映射的物理页，并
-确保 `vmemmap`指向该范围。此外，架构应该实现 :c:func:`vmemmap_populate` 方法，
-它将分配物理内存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊要求，
-它可以使用通用内存管理提供的默认 :c:func:`vmemmap_populate_basepages`。
-
-虚拟映射的内存映射允许将持久性内存设备的 `struct page` 对象存储在这些设备上预先分
-配的存储中。这种存储用vmem_altmap结构表示，最终通过一长串的函数调用传递给
-vmemmap_populate()。vmemmap_populate()实现可以使用 `vmem_altmap` 和
-:c:func:`vmemmap_alloc_block_buf` 助手来分配持久性内存设备上的内存映射。
-
-ZONE_DEVICE
-===========
-`ZONE_DEVICE` 设施建立在 `SPARSEM_VMEMMAP` 之上，为设备驱动识别的物理地址范
-围提供 `struct page` `mem_map` 服务。 `ZONE_DEVICE` 的 "设备" 方面与以下
-事实有关：这些地址范围的页面对象从未被在线标记过，而且必须对设备进行引用，而不仅仅
-是页面，以保持内存被“锁定”以便使用。 `ZONE_DEVICE` ，通过 :c:func:`devm_memremap_pages` ，
-为给定的pfns范围执行足够的内存热插拔来开启 :c:func:`pfn_to_page`，
-:c:func:`page_to_pfn`, ，和 :c:func:`get_user_pages` 服务。由于页面引
-用计数永远不会低于1，所以页面永远不会被追踪为空闲内存，页面的 `struct list_head lru`
-空间被重新利用，用于向映射该内存的主机设备/驱动程序进行反向引用。
-
-虽然 `SPARSEMEM` 将内存作为一个区段的集合，可以选择收集并合成内存块，但
-`ZONE_DEVICE` 用户需要更小的颗粒度来填充 `mem_map` 。鉴于 `ZONE_DEVICE`
-内存从未被在线标记，因此它的内存范围从未通过sysfs内存热插拔api暴露在内存块边界
-上。这个实现依赖于这种缺乏用户接口的约束，允许子段大小的内存范围被指定给
-:c:func:`arch_add_memory` ，即内存热插拔的上半部分。子段支持允许2MB作为
-:c:func:`devm_memremap_pages` 的跨架构通用对齐颗粒度。
-
-`ZONE_DEVICE` 的用户是:
-
-* pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。
-
-* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` ，
-  以允许设备驱动程序协调与设备内存相关的内存管理事件，通常是GPU内存。参见/vm/hmm.rst。
-
-* p2pdma: 创建 `struct page` 对象，允许PCI/E拓扑结构中的peer设备协调它们之间的
-  直接DMA操作，即绕过主机内存。
diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/vm/mmu_notifier.rst
deleted file mode 100644
index b29a37b33628..000000000000
--- a/Documentation/translations/zh_CN/vm/mmu_notifier.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-:Original: Documentation/vm/mmu_notifier.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-
-什么时候需要页表锁内通知？
-==========================
-
-当清除一个pte/pmd时，我们可以选择通过在页表锁下（通知版的\*_clear_flush调用
-mmu_notifier_invalidate_range）通知事件。但这种通知并不是在所有情况下都需要的。
-
-对于二级TLB（非CPU TLB），如IOMMU TLB或设备TLB（当设备使用类似ATS/PASID的东西让
-IOMMU走CPU页表来访问进程的虚拟地址空间）。只有两种情况需要在清除pte/pmd时在持有页
-表锁的同时通知这些二级TLB：
-
-  A) 在mmu_notifier_invalidate_range_end()之前，支持页的地址被释放。
-  B) 一个页表项被更新以指向一个新的页面（COW，零页上的写异常，__replace_page()，...）。
-
-情况A很明显，你不想冒风险让设备写到一个现在可能被一些完全不同的任务使用的页面。
-
-情况B更加微妙。为了正确起见，它需要按照以下序列发生:
-
-  - 上页表锁
-  - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify())
-  - 设置页表项以指向新页
-
-如果在设置新的pte/pmd值之前，清除页表项之后没有进行通知，那么你就会破坏设备的C11或
-C++11等内存模型。
-
-考虑以下情况（设备使用类似于ATS/PASID的功能）。
-
-两个地址addrA和addrB，这样|addrA - addrB| >= PAGE_SIZE，我们假设它们是COW的
-写保护（B的其他情况也适用）。
-
-::
-
- [Time N] --------------------------------------------------------------------
- CPU-thread-0  {尝试写到addrA}
- CPU-thread-1  {尝试写到addrB}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {读取addrA并填充设备TLB}
- DEV-thread-2  {读取addrB并填充设备TLB}
- [Time N+1] ------------------------------------------------------------------
- CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
- CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+2] ------------------------------------------------------------------
- CPU-thread-0  {COW_step1: {更新页表以指向addrA的新页}}
- CPU-thread-1  {COW_step1: {更新页表以指向addrB的新页}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {写入addrA，这是对新页面的写入}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {}
- CPU-thread-3  {写入addrB，这是一个写入新页的过程}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+4] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+5] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {从旧页中读取addrA}
- DEV-thread-2  {从新页面读取addrB}
-
-所以在这里，因为在N+2的时候，清空页表项没有和通知一起作废二级TLB，设备在看到addrA的新值之前
-就看到了addrB的新值。这就破坏了设备的总内存序。
-
-当改变一个pte的写保护或指向一个新的具有相同内容的写保护页（KSM）时，将mmu_notifier_invalidate_range
-调用延迟到页表锁外的mmu_notifier_invalidate_range_end()是可以的。即使做页表更新的线程
-在释放页表锁后但在调用mmu_notifier_invalidate_range_end()前被抢占，也是如此。
diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/vm/numa.rst
deleted file mode 100644
index 6af412b924ad..000000000000
--- a/Documentation/translations/zh_CN/vm/numa.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-:Original: Documentation/vm/numa.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-始于1999年11月，作者： <kanoj@sgi.com>
-
-==========================
-何为非统一内存访问(NUMA)？
-==========================
-
-这个问题可以从几个视角来回答：硬件观点和Linux软件视角。
-
-从硬件角度看，NUMA系统是一个由多个组件或装配组成的计算机平台，每个组件可能包含0个或更多的CPU、
-本地内存和/或IO总线。为了简洁起见，并将这些物理组件/装配的硬件视角与软件抽象区分开来，我们在
-本文中称这些组件/装配为“单元”。
-
-每个“单元”都可以看作是系统的一个SMP[对称多处理器]子集——尽管独立的SMP系统所需的一些组件可能
-不会在任何给定的单元上填充。NUMA系统的单元通过某种系统互连连接在一起——例如，交叉开关或点对点
-链接是NUMA系统互连的常见类型。这两种类型的互连都可以聚合起来，以创建NUMA平台，其中的单元与其
-他单元有多个距离。
-
-对于Linux，感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统系统。在ccNUMA系统中，
-所有的内存都是可见的，并且可以从连接到任何单元的任何CPU中访问，缓存一致性是由处理器缓存和/或
-系统互连在硬件中处理。
-
-内存访问时间和有效的内存带宽取决于包含CPU的单元或进行内存访问的IO总线距离包含目标内存的单元
-有多远。例如，连接到同一单元的CPU对内存的访问将比访问其他远程单元的内存经历更快的访问时间和
-更高的带宽。 NUMA平台可以在任何给定单元上访问多种远程距离的（其他）单元。
-
-平台供应商建立NUMA系统并不只是为了让软件开发人员的生活变得有趣。相反，这种架构是提供可扩展
-内存带宽的一种手段。然而，为了实现可扩展的内存带宽，系统和应用软件必须安排大部分的内存引用
-[cache misses]到“本地”内存——同一单元的内存，如果有的话——或者到最近的有内存的单元。
-
-这就自然而然有了Linux软件对NUMA系统的视角:
-
-Linux将系统的硬件资源划分为多个软件抽象，称为“节点”。Linux将节点映射到硬件平台的物理单元
-上，对一些架构的细节进行了抽象。与物理单元一样，软件节点可能包含0或更多的CPU、内存和/或IO
-总线。同样，对“较近”节点的内存访问——映射到较近单元的节点——通常会比对较远单元的访问经历更快
-的访问时间和更高的有效带宽。
-
-对于一些架构，如x86，Linux将“隐藏”任何代表没有内存连接的物理单元的节点，并将连接到该单元
-的任何CPU重新分配到代表有内存的单元的节点上。因此，在这些架构上，我们不能假设Linux将所有
-的CPU与一个给定的节点相关联，会看到相同的本地内存访问时间和带宽。
-
-此外，对于某些架构，同样以x86为例，Linux支持对额外节点的仿真。对于NUMA仿真，Linux会将现
-有的节点或者非NUMA平台的系统内存分割成多个节点。每个模拟的节点将管理底层单元物理内存的一部
-分。NUMA仿真对于在非NUMA平台上测试NUMA内核和应用功能是非常有用的，当与cpusets一起使用时，
-可以作为一种内存资源管理机制。[见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-对于每个有内存的节点，Linux构建了一个独立的内存管理子系统，有自己的空闲页列表、使用中页列表、
-使用统计和锁来调解访问。此外，Linux为每个内存区[DMA、DMA32、NORMAL、HIGH_MEMORY、MOVABLE
-中的一个或多个]构建了一个有序的“区列表”。zonelist指定了当一个选定的区/节点不能满足分配请求
-时要访问的区/节点。当一个区没有可用的内存来满足请求时，这种情况被称为“overflow 溢出”或
-“fallback 回退”。
-
-由于一些节点包含多个包含不同类型内存的区，Linux必须决定是否对区列表进行排序，使分配回退到不同
-节点上的相同区类型，或同一节点上的不同区类型。这是一个重要的考虑因素，因为有些区，如DMA或DMA32，
-代表了相对稀缺的资源。Linux选择了一个默认的Node ordered zonelist。这意味着在使用按NUMA距
-离排序的远程节点之前，它会尝试回退到同一节点的其他分区。
-
-默认情况下，Linux会尝试从执行请求的CPU被分配到的节点中满足内存分配请求。具体来说，Linux将试
-图从请求来源的节点的适当分区列表中的第一个节点进行分配。这被称为“本地分配”。如果“本地”节点不能
-满足请求，内核将检查所选分区列表中其他节点的区域，寻找列表中第一个能满足请求的区域。
-
-本地分配将倾向于保持对分配的内存的后续访问 “本地”的底层物理资源和系统互连——只要内核代表其分配
-一些内存的任务后来不从该内存迁移。Linux调度器知道平台的NUMA拓扑结构——体现在“调度域”数据结构
-中[见 Documentation/scheduler/sched-domains.rst]——并且调度器试图尽量减少任务迁移到遥
-远的调度域中。然而，调度器并没有直接考虑到任务的NUMA足迹。因此，在充分不平衡的情况下，任务可
-以在节点之间迁移，远离其初始节点和内核数据结构。
-
-系统管理员和应用程序设计者可以使用各种CPU亲和命令行接口，如taskset(1)和numactl(1)，以及程
-序接口，如sched_setaffinity(2)，来限制任务的迁移，以改善NUMA定位。此外，人们可以使用
-Linux NUMA内存策略修改内核的默认本地分配行为。 [见
-:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
-
-系统管理员可以使用控制组和CPUsets限制非特权用户在调度或NUMA命令和功能中可以指定的CPU和节点
-的内存。 [见 Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-在不隐藏无内存节点的架构上，Linux会在分区列表中只包括有内存的区域[节点]。这意味着对于一个无
-内存的节点，“本地内存节点”——CPU节点的分区列表中的第一个区域的节点——将不是节点本身。相反，它
-将是内核在建立分区列表时选择的离它最近的有内存的节点。所以，默认情况下，本地分配将由内核提供
-最近的可用内存来完成。这是同一机制的结果，该机制允许这种分配在一个包含内存的节点溢出时回退到
-其他附近的节点。
-
-一些内核分配不希望或不能容忍这种分配回退行为。相反，他们想确保他们从指定的节点获得内存，或者
-得到通知说该节点没有空闲内存。例如，当一个子系统分配每个CPU的内存资源时，通常是这种情况。
-
-一个典型的分配模式是使用内核的numa_node_id()或CPU_to_node()函数获得“当前CPU”所在节点的
-节点ID，然后只从返回的节点ID请求内存。当这样的分配失败时，请求的子系统可以恢复到它自己的回退
-路径。板块内核内存分配器就是这样的一个例子。或者，子系统可以选择在分配失败时禁用或不启用自己。
-内核分析子系统就是这样的一个例子。
-
-如果架构支持——不隐藏无内存节点，那么连接到无内存节点的CPU将总是产生回退路径的开销，或者一些
-子系统如果试图完全从无内存的节点分配内存，将无法初始化。为了透明地支持这种架构，内核子系统可
-以使用numa_mem_id()或cpu_to_mem()函数来定位调用或指定CPU的“本地内存节点”。同样，这是同
-一个节点，默认的本地页分配将从这个节点开始尝试。
diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst
deleted file mode 100644
index 8765cb118f24..000000000000
--- a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst
+++ /dev/null
@@ -1,86 +0,0 @@
-:Original: Documentation/vm/overcommit-accounting.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-
-==============
-超量使用审计
-==============
-
-Linux内核支持下列超量使用处理模式
-
-0
-	启发式超量使用处理。拒绝明显的地址空间超量使用。用于一个典型的系统。
-	它确保严重的疯狂分配失败，同时允许超量使用以减少swap的使用。在这种模式下，
-	允许root分配稍多的内存。这是默认的。
-1
-	总是超量使用。适用于一些科学应用。经典的例子是使用稀疏数组的代码，只是依赖
-	几乎完全由零页组成的虚拟内存
-
-2
-	不超量使用。系统提交的总地址空间不允许超过swap+一个可配置的物理RAM的数量
-	（默认为50%）。根据你使用的数量，在大多数情况下，这意味着一个进程在访问页面时
-	不会被杀死，但会在内存分配上收到相应的错误。
-
-	对于那些想保证他们的内存分配在未来可用而又不需要初始化每一个页面的应用程序来说
-	是很有用的。
-
-超量使用策略是通过sysctl  `vm.overcommit_memory` 设置的。
-
-可以通过 `vm.overcommit_ratio` （百分比）或 `vm.overcommit_kbytes` （绝对值）
-来设置超限数量。这些只有在 `vm.overcommit_memory` 被设置为2时才有效果。
-
-在 ``/proc/meminfo`` 中可以分别以CommitLimit和Committed_AS的形式查看当前
-的超量使用和提交量。
-
-陷阱
-====
-
-C语言的堆栈增长是一个隐含的mremap。如果你想得到绝对的保证，并在接近边缘的地方运行，
-你 **必须** 为你认为你需要的最大尺寸的堆栈进行mmap。对于典型的堆栈使用来说，这并
-不重要，但如果你真的非常关心的话，这就是一个值得关注的案例。
-
-
-在模式2中，MAP_NORESERVE标志被忽略。
-
-
-它是如何工作的
-==============
-
-超量使用是基于以下规则
-
-对于文件映射
-	| SHARED or READ-only	-	0 cost (该文件是映射而不是交换)
-	| PRIVATE WRITABLE	-	每个实例的映射大小
-
-对于匿名或者 ``/dev/zero`` 映射
-	| SHARED			-	映射的大小
-	| PRIVATE READ-only	-	0 cost (但作用不大)
-	| PRIVATE WRITABLE	-	每个实例的映射大小
-
-额外的计数
-	| 通过mmap制作可写副本的页面
-	| 从同一池中提取的shmfs内存
-
-状态
-====
-
-*	我们核算mmap内存映射
-*	我们核算mprotect在提交中的变化
-*	我们核算mremap的大小变化
-*	我们的审计 brk
-*	审计munmap
-*	我们在/proc中报告commit 状态
-*	核对并检查分叉的情况
-*	审查堆栈处理/执行中的构建
-*	叙述SHMfs的情况
-*	实现实际限制的执行
-
-待续
-====
-*	ptrace 页计数（这很难）。
diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/vm/page_frags.rst
deleted file mode 100644
index ad27fed33634..000000000000
--- a/Documentation/translations/zh_CN/vm/page_frags.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-:Original: Documentation/vm/page_frag.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-页面片段
-========
-
-一个页面片段是一个任意长度的任意偏移的内存区域，它位于一个0或更高阶的复合页面中。
-该页中的多个碎片在该页的引用计数器中被单独计算。
-
-page_frag函数，page_frag_alloc和page_frag_free，为页面片段提供了一个简单
-的分配框架。这被网络堆栈和网络设备驱动使用，以提供一个内存的支持区域，作为
-sk_buff->head使用，或者用于skb_shared_info的 “frags” 部分。
-
-为了使用页面片段API，需要一个支持页面片段的缓冲区。这为碎片分配提供了一个中心点，
-并允许多个调用使用一个缓存的页面。这样做的好处是可以避免对get_page的多次调用，
-这在分配时开销可能会很大。然而，由于这种缓存的性质，要求任何对缓存的调用都要受到每
-个CPU的限制，或者每个CPU的限制，并在执行碎片分配时强制禁止中断。
-
-网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用
-netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache
-被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是
-它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用，因为这些函数
-将禁用中断，而 ”napi“ 前缀的函数只可以在softirq上下文中使用。
-
-许多网络设备驱动程序使用类似的方法来分配页面片段，但页面片段是在环或描述符级别上
-缓存的。为了实现这些情况，有必要提供一种拆解页面缓存的通用方法。出于这个原因，
-__page_frag_cache_drain被实现了。它允许通过一次调用从一个页面释放多个引用。
-这样做的好处是，它允许清理被添加到一个页面的多个引用，以避免每次分配都调用
-get_page。
-
-Alexander Duyck，2016年11月29日。
diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/vm/page_owner.rst
deleted file mode 100644
index 9e951fabba9d..000000000000
--- a/Documentation/translations/zh_CN/vm/page_owner.rst
+++ /dev/null
@@ -1,116 +0,0 @@
-:Original: Documentation/vm/page_owner.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-================================
-page owner: 跟踪谁分配的每个页面
-================================
-
-概述
-====
-
-page owner是用来追踪谁分配的每一个页面。它可以用来调试内存泄漏或找到内存占用者。
-当分配发生时，有关分配的信息，如调用堆栈和页面的顺序被存储到每个页面的特定存储中。
-当我们需要了解所有页面的状态时，我们可以获得并分析这些信息。
-
-尽管我们已经有了追踪页面分配/释放的tracepoint，但用它来分析谁分配的每个页面是
-相当复杂的。我们需要扩大跟踪缓冲区，以防止在用户空间程序启动前出现重叠。而且，启
-动的程序会不断地将跟踪缓冲区转出，供以后分析，这将会改变系统的行为，会产生更多的
-可能性，而不是仅仅保留在内存中，所以不利于调试。
-
-页面所有者也可以用于各种目的。例如，可以通过每个页面的gfp标志信息获得精确的碎片
-统计。如果启用了page owner，它就已经实现并激活了。我们非常欢迎其他用途。
-
-page owner在默认情况下是禁用的。所以，如果你想使用它，你需要在你的启动cmdline
-中加入"page_owner=on"。如果内核是用page owner构建的，并且由于没有启用启动
-选项而在运行时禁用page owner，那么运行时的开销是很小的。如果在运行时禁用，它不
-需要内存来存储所有者信息，所以没有运行时内存开销。而且，页面所有者在页面分配器的
-热路径中只插入了两个不可能的分支，如果不启用，那么分配就会像没有页面所有者的内核
-一样进行。这两个不可能的分支应该不会影响到分配的性能，特别是在静态键跳转标签修补
-功能可用的情况下。以下是由于这个功能而导致的内核代码大小的变化。
-
-- 没有page owner::
-
-   text    data     bss     dec     hex filename
-   48392   2333     644   51369    c8a9 mm/page_alloc.o
-
-- 有page owner::
-
-   text    data     bss     dec     hex filename
-   48800   2445     644   51889    cab1 mm/page_alloc.o
-   6662     108      29    6799    1a8f mm/page_owner.o
-   1025       8       8    1041     411 mm/page_ext.o
-
-虽然总共增加了8KB的代码，但page_alloc.o增加了520字节，其中不到一半是在hotpath
-中。构建带有page owner的内核，并在需要时打开它，将是调试内核内存问题的最佳选择。
-
-有一个问题是由实现细节引起的。页所有者将信息存储到struct page扩展的内存中。这
-个内存的初始化时间比稀疏内存系统中的页面分配器启动的时间要晚一些，所以，在初始化
-之前，许多页面可以被分配，但它们没有所有者信息。为了解决这个问题，这些早期分配的
-页面在初始化阶段被调查并标记为分配。虽然这并不意味着它们有正确的所有者信息，但至
-少，我们可以更准确地判断该页是否被分配。在2GB内存的x86-64虚拟机上，有13343
-个早期分配的页面被捕捉和标记，尽管它们大部分是由结构页扩展功能分配的。总之，在这
-之后，没有任何页面处于未追踪状态。
-
-使用方法
-========
-
-1) 构建用户空间的帮助::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) 启用page owner: 添加 "page_owner=on" 到 boot cmdline.
-
-3) 做你想调试的工作。
-
-4) 分析来自页面所有者的信息::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
-
-   ``page_owner_full.txt`` 的一般输出情况如下(输出信息无翻译价值)::
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-   ``page_owner_sort`` 工具忽略了 ``PFN`` 行，将剩余的行放在buf中，使用regexp提
-   取页序值，计算buf的次数和页数，最后根据参数进行排序。
-
-   在 ``sorted_page_owner.txt`` 中可以看到关于谁分配了每个页面的结果。一般输出::
-
-	XXX times, XXX pages:
-	Page allocated via order XXX, ...
-	// Detailed stack
-
-   默认情况下， ``page_owner_sort`` 是根据buf的时间来排序的。如果你想
-   按buf的页数排序，请使用-m参数。详细的参数是:
-
-   基本函数:
-
-	Sort:
-		-a		按内存分配时间排序
-		-m		按总内存排序
-		-p		按pid排序。
-		-P		按tgid排序。
-		-r		按内存释放时间排序。
-		-s		按堆栈跟踪排序。
-		-t		按时间排序（默认）。
-
-   其它函数:
-
-	Cull:
-		-c		通过比较堆栈跟踪而不是总块来进行剔除。
-
-	Filter:
-		-f		过滤掉内存已被释放的块的信息。
diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/vm/page_table_check.rst
deleted file mode 100644
index a29fc1b360e6..000000000000
--- a/Documentation/translations/zh_CN/vm/page_table_check.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:Original: Documentation/vm/page_table_check.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-========
-页表检查
-========
-
-概述
-====
-
-页表检查允许通过确保防止某些类型的内存损坏来强化内核。
-
-当新的页面可以从用户空间访问时，页表检查通过将它们的页表项（PTEs PMD等）添加到页表中来执行额外
-的验证。
-
-在检测到损坏的情况下，内核会被崩溃。页表检查有一个小的性能和内存开销。因此，它在默认情况下是禁用
-的，但是在额外的加固超过性能成本的系统上，可以选择启用。另外，由于页表检查是同步的，它可以帮助调
-试双映射内存损坏问题，在错误的映射发生时崩溃内核，而不是在内存损坏错误发生后内核崩溃。
-
-双重映射检测逻辑
-================
-
-+-------------------+-------------------+-------------------+------------------+
-| Current Mapping   | New mapping       | Permissions       | Rule             |
-+===================+===================+===================+==================+
-| Anonymous         | Anonymous         | Read              | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Named             | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Anonymous         | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Named             | Any               | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-
-启用页表检查
-============
-
-用以下方法构建内核:
-
-- PAGE_TABLE_CHECK=y
-  注意，它只能在ARCH_SUPPORTS_PAGE_TABLE_CHECK可用的平台上启用。
-
-- 使用 "page_table_check=on" 内核参数启动。
-
-可以选择用PAGE_TABLE_CHECK_ENFORCED来构建内核，以便在没有额外的内核参数的情况下获得页表
-支持。
diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/vm/remap_file_pages.rst
deleted file mode 100644
index af6b7e28af23..000000000000
--- a/Documentation/translations/zh_CN/vm/remap_file_pages.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-:Original: Documentation/vm/remap_file_pages.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-==============================
-remap_file_pages()系统调用
-==============================
-
-remap_file_pages()系统调用被用来创建一个非线性映射，也就是说，在这个映射中，
-文件的页面被无序映射到内存中。使用remap_file_pages()比重复调用mmap(2)的好
-处是，前者不需要内核创建额外的VMA（虚拟内存区）数据结构。
-
-支持非线性映射需要在内核虚拟内存子系统中编写大量的non-trivial的代码，包括热
-路径。另外，为了使非线性映射工作，内核需要一种方法来区分正常的页表项和带有文件
-偏移的项（pte_file）。内核为达到这个目的在PTE中保留了标志。PTE标志是稀缺资
-源，特别是在某些CPU架构上。如果能腾出这个标志用于其他用途就更好了。
-
-幸运的是，在生活中并没有很多remap_file_pages()的用户。只知道有一个企业的RDBMS
-实现在32位系统上使用这个系统调用来映射比32位虚拟地址空间线性尺寸更大的文件。
-由于64位系统的广泛使用，这种使用情况已经不重要了。
-
-syscall被废弃了，现在用一个模拟来代替它。仿真会创建新的VMA，而不是非线性映射。
-对于remap_file_pages()的少数用户来说，它的工作速度会变慢，但ABI被保留了。
-
-仿真的一个副作用（除了性能之外）是，由于额外的VMA，用户可以更容易达到
-vm.max_map_count的限制。关于限制的更多细节，请参见DEFAULT_MAX_MAP_COUNT
-的注释。
diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst
deleted file mode 100644
index 50694d97c426..000000000000
--- a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-:Original: Documentation/vm/split_page_table_lock.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-=================================
-分页表锁（split page table lock）
-=================================
-
-最初，mm->page_table_lock spinlock保护了mm_struct的所有页表。但是这种方
-法导致了多线程应用程序的缺页异常可扩展性差，因为对锁的争夺很激烈。为了提高可扩
-展性，我们引入了分页表锁。
-
-有了分页表锁，我们就有了单独的每张表锁来顺序化对表的访问。目前，我们对PTE和
-PMD表使用分页锁。对高层表的访问由mm->page_table_lock保护。
-
-有一些辅助工具来锁定/解锁一个表和其他访问器函数:
-
- - pte_offset_map_lock()
-	映射pte并获取PTE表锁，返回所取锁的指针；
- - pte_unmap_unlock()
-	解锁和解映射PTE表；
- - pte_alloc_map_lock()
-	如果需要的话，分配PTE表并获取锁，如果分配失败，返回已获取的锁的指针
-	或NULL；
- - pte_lockptr()
-	返回指向PTE表锁的指针；
- - pmd_lock()
-	取得PMD表锁，返回所取锁的指针。
- - pmd_lockptr()
-	返回指向PMD表锁的指针；
-
-如果CONFIG_SPLIT_PTLOCK_CPUS（通常为4）小于或等于NR_CPUS，则在编译
-时启用PTE表的分页表锁。如果分页锁被禁用，所有的表都由mm->page_table_lock
-来保护。
-
-如果PMD表启用了分页锁，并且架构支持它，那么PMD表的分页锁就会被启用（见
-下文）。
-
-Hugetlb 和分页表锁
-==================
-
-Hugetlb可以支持多种页面大小。我们只对PMD级别使用分页锁，但不对PUD使用。
-
-Hugetlb特定的辅助函数:
-
- - huge_pte_lock()
-	对PMD_SIZE页面采取pmd分割锁，否则mm->page_table_lock；
- - huge_pte_lockptr()
-	返回指向表锁的指针。
-
-架构对分页表锁的支持
-====================
-
-没有必要特别启用PTE分页表锁：所有需要的东西都由pgtable_pte_page_ctor()
-和pgtable_pte_page_dtor()完成，它们必须在PTE表分配/释放时被调用。
-
-确保架构不使用slab分配器来分配页表：slab使用page->slab_cache来分配其页
-面。这个区域与page->ptl共享存储。
-
-PMD分页锁只有在你有两个以上的页表级别时才有意义。
-
-启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor()，在释放时调
-用pgtable_pmd_page_dtor()。
-
-分配通常发生在pmd_alloc_one()中，释放发生在pmd_free()和pmd_free_tlb()
-中，但要确保覆盖所有的PMD表分配/释放路径：即X86_PAE在pgd_alloc()中预先
-分配一些PMD。
-
-一切就绪后，你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
-
-注意：pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必
-须正确处理。
-
-page->ptl
-=========
-
-page->ptl用于访问分割页表锁，其中'page'是包含该表的页面struct page。它
-与page->private（以及union中的其他几个字段）共享存储。
-
-为了避免增加struct page的大小并获得最佳性能，我们使用了一个技巧:
-
- - 如果spinlock_t适合于long，我们使用page->ptr作为spinlock，这样我们
-   就可以避免间接访问并节省一个缓存行。
- - 如果spinlock_t的大小大于long的大小，我们使用page->ptl作为spinlock_t
-   的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的
-   情况下使用分页锁，但由于间接访问而多花了一个缓存行。
-
-PTE表的spinlock_t分配在pgtable_pte_page_ctor()中，PMD表的spinlock_t
-分配在pgtable_pmd_page_ctor()中。
-
-请不要直接访问page->ptl - -使用适当的辅助函数。
diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/vm/z3fold.rst
deleted file mode 100644
index 57204aa08caa..000000000000
--- a/Documentation/translations/zh_CN/vm/z3fold.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-:Original: Documentation/vm/z3fold.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-
-======
-z3fold
-======
-
-z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
-它是zbud的衍生物，允许更高的压缩率，保持其前辈的简单性和确定性。
-
-z3fold和zbud的主要区别是:
-
-* 与zbud不同的是，z3fold允许最大的PAGE_SIZE分配。
-* z3fold在其页面中最多可以容纳3个压缩页面
-* z3fold本身没有输出任何API，因此打算通过zpool的API来使用
-
-为了保持确定性和简单性，z3fold，就像zbud一样，总是在每页存储一个整数的压缩页，但是
-它最多可以存储3页，不像zbud最多可以存储2页。因此压缩率达到2.7倍左右，而zbud的压缩
-率是1.7倍左右。
-
-不像zbud（但也像zsmalloc），z3fold_alloc()那样不返回一个可重复引用的指针。相反，它
-返回一个无符号长句柄，它编码了被分配对象的实际位置。
-
-保持有效的压缩率接近于zsmalloc，z3fold不依赖于MMU的启用，并提供更可预测的回收行
-为，这使得它更适合于小型和反应迅速的系统。
diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/vm/zsmalloc.rst
deleted file mode 100644
index 29e9c70a8eb6..000000000000
--- a/Documentation/translations/zh_CN/vm/zsmalloc.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-:Original: Documentation/vm/zs_malloc.rst
-
-:翻译:
-
- 司延腾 Yanteng Si <siyanteng@loongson.cn>
-
-:校译:
-
-========
-zsmalloc
-========
-
-这个分配器是为与zram一起使用而设计的。因此，该分配器应该在低内存条件下工作良好。特别是，
-它从未尝试过higher order页面的分配，这在内存压力下很可能会失败。另一方面，如果我们只
-是使用单（0-order）页，它将遭受非常高的碎片化 - 任何大小为PAGE_SIZE/2或更大的对象将
-占据整个页面。这是其前身（xvmalloc）的主要问题之一。
-
-为了克服这些问题，zsmalloc分配了一堆0-order页面，并使用各种"struct page"字段将它
-们链接起来。这些链接的页面作为一个单一的higher order页面，即一个对象可以跨越0-order
-页面的边界。代码将这些链接的页面作为一个实体，称为zspage。
-
-为了简单起见，zsmalloc只能分配大小不超过PAGE_SIZE的对象，因为这满足了所有当前用户的
-要求（在最坏的情况下，页面是不可压缩的，因此以"原样"即未压缩的形式存储）。对于大于这
-个大小的分配请求，会返回失败（见zs_malloc）。
-
-此外，zs_malloc()并不返回一个可重复引用的指针。相反，它返回一个不透明的句柄（无符号
-长），它编码了被分配对象的实际位置。这种间接性的原因是zsmalloc并不保持zspages的永久
-映射，因为这在32位系统上会导致问题，因为内核空间映射的VA区域非常小。因此，在使用分配
-的内存之前，对象必须使用zs_map_object()进行映射以获得一个可用的指针，随后使用
-zs_unmap_object()解除映射。
-
-stat
-====
-
-通过CONFIG_ZSMALLOC_STAT，我们可以通过 ``/sys/kernel/debug/zsmalloc/<user name>``
-看到zsmalloc内部信息。下面是一个统计输出的例子。::
-
- # cat /sys/kernel/debug/zsmalloc/zram0/classes
-
- class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
-    ...
-    ...
-     9   176           0            1           186        129          8                4
-    10   192           1            0          2880       2872        135                3
-    11   208           0            1           819        795         42                2
-    12   224           0            1           219        159         12                4
-    ...
-    ...
-
-
-class
-	索引
-size
-	zspage存储对象大小
-almost_empty
-	ZS_ALMOST_EMPTY zspage的数量（见下文）。
-almost_full
-	ZS_ALMOST_FULL zspage的数量(见下图)
-obj_allocated
-	已分配对象的数量
-obj_used
-	分配给用户的对象的数量
-pages_used
-	为该类分配的页数
-pages_per_zspage
-	组成一个zspage的0-order页面的数量
-
-当n <= N / f时，我们将一个zspage分配给ZS_ALMOST_EMPTYfullness组，其中
-
-* n = 已分配对象的数量
-* N = zspage可以存储的对象总数
-* f = fullness_threshold_frac(即，目前是4个)
-
-同样地，我们将zspage分配给:
-
-* ZS_ALMOST_FULL  when n > N / f
-* ZS_EMPTY        when n == 0
-* ZS_FULL         when n == N
diff --git a/Documentation/translations/zh_TW/index.rst b/Documentation/translations/zh_TW/index.rst
index e1ce9d8c06f8..e97d7d578751 100644
--- a/Documentation/translations/zh_TW/index.rst
+++ b/Documentation/translations/zh_TW/index.rst
@@ -128,7 +128,7 @@ TODOList:
 * security/index
 * sound/index
 * crypto/index
-* vm/index
+* mm/index
 * bpf/index
 * usb/index
 * PCI/index
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
deleted file mode 100644
index bc74f5643008..000000000000
--- a/Documentation/vm/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-page-types
-slabinfo
diff --git a/Documentation/vm/active_mm.rst b/Documentation/vm/active_mm.rst
deleted file mode 100644
index 6f8269c284ed..000000000000
--- a/Documentation/vm/active_mm.rst
+++ /dev/null
@@ -1,91 +0,0 @@
-.. _active_mm:
-
-=========
-Active MM
-=========
-
-::
-
- List:       linux-kernel
- Subject:    Re: active_mm
- From:       Linus Torvalds <torvalds () transmeta ! com>
- Date:       1999-07-30 21:36:24
-
- Cc'd to linux-kernel, because I don't write explanations all that often,
- and when I do I feel better about more people reading them.
-
- On Fri, 30 Jul 1999, David Mosberger wrote:
- >
- > Is there a brief description someplace on how "mm" vs. "active_mm" in
- > the task_struct are supposed to be used?  (My apologies if this was
- > discussed on the mailing lists---I just returned from vacation and
- > wasn't able to follow linux-kernel for a while).
-
- Basically, the new setup is:
-
-  - we have "real address spaces" and "anonymous address spaces". The
-    difference is that an anonymous address space doesn't care about the
-    user-level page tables at all, so when we do a context switch into an
-    anonymous address space we just leave the previous address space
-    active.
-
-    The obvious use for a "anonymous address space" is any thread that
-    doesn't need any user mappings - all kernel threads basically fall into
-    this category, but even "real" threads can temporarily say that for
-    some amount of time they are not going to be interested in user space,
-    and that the scheduler might as well try to avoid wasting time on
-    switching the VM state around. Currently only the old-style bdflush
-    sync does that.
-
-  - "tsk->mm" points to the "real address space". For an anonymous process,
-    tsk->mm will be NULL, for the logical reason that an anonymous process
-    really doesn't _have_ a real address space at all.
-
-  - however, we obviously need to keep track of which address space we
-    "stole" for such an anonymous user. For that, we have "tsk->active_mm",
-    which shows what the currently active address space is.
-
-    The rule is that for a process with a real address space (ie tsk->mm is
-    non-NULL) the active_mm obviously always has to be the same as the real
-    one.
-
-    For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
-    "borrowed" mm while the anonymous process is running. When the
-    anonymous process gets scheduled away, the borrowed address space is
-    returned and cleared.
-
- To support all that, the "struct mm_struct" now has two counters: a
- "mm_users" counter that is how many "real address space users" there are,
- and a "mm_count" counter that is the number of "lazy" users (ie anonymous
- users) plus one if there are any real users.
-
- Usually there is at least one real user, but it could be that the real
- user exited on another CPU while a lazy user was still active, so you do
- actually get cases where you have a address space that is _only_ used by
- lazy users. That is often a short-lived state, because once that thread
- gets scheduled away in favour of a real thread, the "zombie" mm gets
- released because "mm_count" becomes zero.
-
- Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
- more. "init_mm" should be considered just a "lazy context when no other
- context is available", and in fact it is mainly used just at bootup when
- no real VM has yet been created. So code that used to check
-
- 	if (current->mm == &init_mm)
-
- should generally just do
-
- 	if (!current->mm)
-
- instead (which makes more sense anyway - the test is basically one of "do
- we have a user context", and is generally done by the page fault handler
- and things like that).
-
- Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
- because it slightly changes the interfaces to accommodate the alpha (who
- would have thought it, but the alpha actually ends up having one of the
- ugliest context switch codes - unlike the other architectures where the MM
- and register state is separate, the alpha PALcode joins the two, and you
- need to switch both together).
-
- (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
deleted file mode 100644
index cbaee9e59241..000000000000
--- a/Documentation/vm/arch_pgtable_helpers.rst
+++ /dev/null
@@ -1,260 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _arch_page_table_helpers:
-
-===============================
-Architecture Page Table Helpers
-===============================
-
-Generic MM expects architectures (with MMU) to provide helpers to create, access
-and modify page table entries at various level for different memory functions.
-These page table helpers need to conform to a common semantics across platforms.
-Following tables describe the expected semantics which can also be tested during
-boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug
-test need to be in sync.
-
-
-PTE Page Table Helpers
-======================
-
-+---------------------------+--------------------------------------------------+
-| pte_same                  | Tests whether both PTE entries are the same      |
-+---------------------------+--------------------------------------------------+
-| pte_bad                   | Tests a non-table mapped PTE                     |
-+---------------------------+--------------------------------------------------+
-| pte_present               | Tests a valid mapped PTE                         |
-+---------------------------+--------------------------------------------------+
-| pte_young                 | Tests a young PTE                                |
-+---------------------------+--------------------------------------------------+
-| pte_dirty                 | Tests a dirty PTE                                |
-+---------------------------+--------------------------------------------------+
-| pte_write                 | Tests a writable PTE                             |
-+---------------------------+--------------------------------------------------+
-| pte_special               | Tests a special PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_protnone              | Tests a PROT_NONE PTE                            |
-+---------------------------+--------------------------------------------------+
-| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
-+---------------------------+--------------------------------------------------+
-| pte_soft_dirty            | Tests a soft dirty PTE                           |
-+---------------------------+--------------------------------------------------+
-| pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
-+---------------------------+--------------------------------------------------+
-| pte_mkyoung               | Creates a young PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_mkold                 | Creates an old PTE                               |
-+---------------------------+--------------------------------------------------+
-| pte_mkdirty               | Creates a dirty PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_mkclean               | Creates a clean PTE                              |
-+---------------------------+--------------------------------------------------+
-| pte_mkwrite               | Creates a writable PTE                           |
-+---------------------------+--------------------------------------------------+
-| pte_wrprotect             | Creates a write protected PTE                    |
-+---------------------------+--------------------------------------------------+
-| pte_mkspecial             | Creates a special PTE                            |
-+---------------------------+--------------------------------------------------+
-| pte_mkdevmap              | Creates a ZONE_DEVICE mapped PTE                 |
-+---------------------------+--------------------------------------------------+
-| pte_mksoft_dirty          | Creates a soft dirty PTE                         |
-+---------------------------+--------------------------------------------------+
-| pte_clear_soft_dirty      | Clears a soft dirty PTE                          |
-+---------------------------+--------------------------------------------------+
-| pte_swp_mksoft_dirty      | Creates a soft dirty swapped PTE                 |
-+---------------------------+--------------------------------------------------+
-| pte_swp_clear_soft_dirty  | Clears a soft dirty swapped PTE                  |
-+---------------------------+--------------------------------------------------+
-| pte_mknotpresent          | Invalidates a mapped PTE                         |
-+---------------------------+--------------------------------------------------+
-| ptep_clear                | Clears a PTE                                     |
-+---------------------------+--------------------------------------------------+
-| ptep_get_and_clear        | Clears and returns PTE                           |
-+---------------------------+--------------------------------------------------+
-| ptep_get_and_clear_full   | Clears and returns PTE (batched PTE unmap)       |
-+---------------------------+--------------------------------------------------+
-| ptep_test_and_clear_young | Clears young from a PTE                          |
-+---------------------------+--------------------------------------------------+
-| ptep_set_wrprotect        | Converts into a write protected PTE              |
-+---------------------------+--------------------------------------------------+
-| ptep_set_access_flags     | Converts into a more permissive PTE              |
-+---------------------------+--------------------------------------------------+
-
-
-PMD Page Table Helpers
-======================
-
-+---------------------------+--------------------------------------------------+
-| pmd_same                  | Tests whether both PMD entries are the same      |
-+---------------------------+--------------------------------------------------+
-| pmd_bad                   | Tests a non-table mapped PMD                     |
-+---------------------------+--------------------------------------------------+
-| pmd_leaf                  | Tests a leaf mapped PMD                          |
-+---------------------------+--------------------------------------------------+
-| pmd_huge                  | Tests a HugeTLB mapped PMD                       |
-+---------------------------+--------------------------------------------------+
-| pmd_trans_huge            | Tests a Transparent Huge Page (THP) at PMD       |
-+---------------------------+--------------------------------------------------+
-| pmd_present               | Tests a valid mapped PMD                         |
-+---------------------------+--------------------------------------------------+
-| pmd_young                 | Tests a young PMD                                |
-+---------------------------+--------------------------------------------------+
-| pmd_dirty                 | Tests a dirty PMD                                |
-+---------------------------+--------------------------------------------------+
-| pmd_write                 | Tests a writable PMD                             |
-+---------------------------+--------------------------------------------------+
-| pmd_special               | Tests a special PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_protnone              | Tests a PROT_NONE PMD                            |
-+---------------------------+--------------------------------------------------+
-| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
-+---------------------------+--------------------------------------------------+
-| pmd_soft_dirty            | Tests a soft dirty PMD                           |
-+---------------------------+--------------------------------------------------+
-| pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
-+---------------------------+--------------------------------------------------+
-| pmd_mkyoung               | Creates a young PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_mkold                 | Creates an old PMD                               |
-+---------------------------+--------------------------------------------------+
-| pmd_mkdirty               | Creates a dirty PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_mkclean               | Creates a clean PMD                              |
-+---------------------------+--------------------------------------------------+
-| pmd_mkwrite               | Creates a writable PMD                           |
-+---------------------------+--------------------------------------------------+
-| pmd_wrprotect             | Creates a write protected PMD                    |
-+---------------------------+--------------------------------------------------+
-| pmd_mkspecial             | Creates a special PMD                            |
-+---------------------------+--------------------------------------------------+
-| pmd_mkdevmap              | Creates a ZONE_DEVICE mapped PMD                 |
-+---------------------------+--------------------------------------------------+
-| pmd_mksoft_dirty          | Creates a soft dirty PMD                         |
-+---------------------------+--------------------------------------------------+
-| pmd_clear_soft_dirty      | Clears a soft dirty PMD                          |
-+---------------------------+--------------------------------------------------+
-| pmd_swp_mksoft_dirty      | Creates a soft dirty swapped PMD                 |
-+---------------------------+--------------------------------------------------+
-| pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
-+---------------------------+--------------------------------------------------+
-| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
-+---------------------------+--------------------------------------------------+
-| pmd_set_huge              | Creates a PMD huge mapping                       |
-+---------------------------+--------------------------------------------------+
-| pmd_clear_huge            | Clears a PMD huge mapping                        |
-+---------------------------+--------------------------------------------------+
-| pmdp_get_and_clear        | Clears a PMD                                     |
-+---------------------------+--------------------------------------------------+
-| pmdp_get_and_clear_full   | Clears a PMD                                     |
-+---------------------------+--------------------------------------------------+
-| pmdp_test_and_clear_young | Clears young from a PMD                          |
-+---------------------------+--------------------------------------------------+
-| pmdp_set_wrprotect        | Converts into a write protected PMD              |
-+---------------------------+--------------------------------------------------+
-| pmdp_set_access_flags     | Converts into a more permissive PMD              |
-+---------------------------+--------------------------------------------------+
-
-
-PUD Page Table Helpers
-======================
-
-+---------------------------+--------------------------------------------------+
-| pud_same                  | Tests whether both PUD entries are the same      |
-+---------------------------+--------------------------------------------------+
-| pud_bad                   | Tests a non-table mapped PUD                     |
-+---------------------------+--------------------------------------------------+
-| pud_leaf                  | Tests a leaf mapped PUD                          |
-+---------------------------+--------------------------------------------------+
-| pud_huge                  | Tests a HugeTLB mapped PUD                       |
-+---------------------------+--------------------------------------------------+
-| pud_trans_huge            | Tests a Transparent Huge Page (THP) at PUD       |
-+---------------------------+--------------------------------------------------+
-| pud_present               | Tests a valid mapped PUD                         |
-+---------------------------+--------------------------------------------------+
-| pud_young                 | Tests a young PUD                                |
-+---------------------------+--------------------------------------------------+
-| pud_dirty                 | Tests a dirty PUD                                |
-+---------------------------+--------------------------------------------------+
-| pud_write                 | Tests a writable PUD                             |
-+---------------------------+--------------------------------------------------+
-| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
-+---------------------------+--------------------------------------------------+
-| pud_mkyoung               | Creates a young PUD                              |
-+---------------------------+--------------------------------------------------+
-| pud_mkold                 | Creates an old PUD                               |
-+---------------------------+--------------------------------------------------+
-| pud_mkdirty               | Creates a dirty PUD                              |
-+---------------------------+--------------------------------------------------+
-| pud_mkclean               | Creates a clean PUD                              |
-+---------------------------+--------------------------------------------------+
-| pud_mkwrite               | Creates a writable PUD                           |
-+---------------------------+--------------------------------------------------+
-| pud_wrprotect             | Creates a write protected PUD                    |
-+---------------------------+--------------------------------------------------+
-| pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
-+---------------------------+--------------------------------------------------+
-| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
-+---------------------------+--------------------------------------------------+
-| pud_set_huge              | Creates a PUD huge mapping                       |
-+---------------------------+--------------------------------------------------+
-| pud_clear_huge            | Clears a PUD huge mapping                        |
-+---------------------------+--------------------------------------------------+
-| pudp_get_and_clear        | Clears a PUD                                     |
-+---------------------------+--------------------------------------------------+
-| pudp_get_and_clear_full   | Clears a PUD                                     |
-+---------------------------+--------------------------------------------------+
-| pudp_test_and_clear_young | Clears young from a PUD                          |
-+---------------------------+--------------------------------------------------+
-| pudp_set_wrprotect        | Converts into a write protected PUD              |
-+---------------------------+--------------------------------------------------+
-| pudp_set_access_flags     | Converts into a more permissive PUD              |
-+---------------------------+--------------------------------------------------+
-
-
-HugeTLB Page Table Helpers
-==========================
-
-+---------------------------+--------------------------------------------------+
-| pte_huge                  | Tests a HugeTLB                                  |
-+---------------------------+--------------------------------------------------+
-| pte_mkhuge                | Creates a HugeTLB                                |
-+---------------------------+--------------------------------------------------+
-| huge_pte_dirty            | Tests a dirty HugeTLB                            |
-+---------------------------+--------------------------------------------------+
-| huge_pte_write            | Tests a writable HugeTLB                         |
-+---------------------------+--------------------------------------------------+
-| huge_pte_mkdirty          | Creates a dirty HugeTLB                          |
-+---------------------------+--------------------------------------------------+
-| huge_pte_mkwrite          | Creates a writable HugeTLB                       |
-+---------------------------+--------------------------------------------------+
-| huge_pte_wrprotect        | Creates a write protected HugeTLB                |
-+---------------------------+--------------------------------------------------+
-| huge_ptep_get_and_clear   | Clears a HugeTLB                                 |
-+---------------------------+--------------------------------------------------+
-| huge_ptep_set_wrprotect   | Converts into a write protected HugeTLB          |
-+---------------------------+--------------------------------------------------+
-| huge_ptep_set_access_flags  | Converts into a more permissive HugeTLB        |
-+---------------------------+--------------------------------------------------+
-
-
-SWAP Page Table Helpers
-========================
-
-+---------------------------+--------------------------------------------------+
-| __pte_to_swp_entry        | Creates a swapped entry (arch) from a mapped PTE |
-+---------------------------+--------------------------------------------------+
-| __swp_to_pte_entry        | Creates a mapped PTE from a swapped entry (arch) |
-+---------------------------+--------------------------------------------------+
-| __pmd_to_swp_entry        | Creates a swapped entry (arch) from a mapped PMD |
-+---------------------------+--------------------------------------------------+
-| __swp_to_pmd_entry        | Creates a mapped PMD from a swapped entry (arch) |
-+---------------------------+--------------------------------------------------+
-| is_migration_entry        | Tests a migration (read or write) swapped entry  |
-+-------------------------------+----------------------------------------------+
-| is_writable_migration_entry   | Tests a write migration swapped entry        |
-+-------------------------------+----------------------------------------------+
-| make_readable_migration_entry | Creates a read migration swapped entry       |
-+-------------------------------+----------------------------------------------+
-| make_writable_migration_entry | Creates a write migration swapped entry      |
-+-------------------------------+----------------------------------------------+
-
-[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst
deleted file mode 100644
index 6a1fadf3e173..000000000000
--- a/Documentation/vm/balance.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-.. _balance:
-
-================
-Memory Balancing
-================
-
-Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
-
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
-well as for non __GFP_IO allocations.
-
-The first reason why a caller may avoid reclaim is that the caller can not
-sleep due to holding a spinlock or is in interrupt context. The second may
-be that the caller is willing to fail the allocation without incurring the
-overhead of page reclaim. This may happen for opportunistic high-order
-allocation requests that have order-0 fallback options. In such cases,
-the caller may also wish to avoid waking kswapd.
-
-__GFP_IO allocation requests are made to prevent file system deadlocks.
-
-In the absence of non sleepable allocation requests, it seems detrimental
-to be doing balancing. Page reclamation can be kicked off lazily, that
-is, only when needed (aka zone free memory is 0), instead of making it
-a proactive process.
-
-That being said, the kernel should try to fulfill requests for direct
-mapped pages from the direct mapped pool, instead of falling back on
-the dma pool, so as to keep the dma pool filled for dma requests (atomic
-or not). A similar argument applies to highmem and direct mapped pages.
-OTOH, if there is a lot of free dma pages, it is preferable to satisfy
-regular memory requests by allocating one from the dma pool, instead
-of incurring the overhead of regular zone balancing.
-
-In 2.2, memory balancing/page reclamation would kick off only when the
-_total_ number of free pages fell below 1/64 th of total memory. With the
-right ratio of dma and regular memory, it is quite possible that balancing
-would not be done even when the dma zone was completely empty. 2.2 has
-been running production machines of varying memory sizes, and seems to be
-doing fine even with the presence of this problem. In 2.3, due to
-HIGHMEM, this problem is aggravated.
-
-In 2.3, zone balancing can be done in one of two ways: depending on the
-zone size (and possibly of the size of lower class zones), we can decide
-at init time how many free pages we should aim for while balancing any
-zone. The good part is, while balancing, we do not need to look at sizes
-of lower class zones, the bad part is, we might do too frequent balancing
-due to ignoring possibly lower usage in the lower class zones. Also,
-with a slight change in the allocation routine, it is possible to reduce
-the memclass() macro to be a simple equality.
-
-Another possible solution is that we balance only when the free memory
-of a zone _and_ all its lower class zones falls below 1/64th of the
-total memory in the zone and its lower class zones. This fixes the 2.2
-balancing problem, and stays as close to 2.2 behavior as possible. Also,
-the balancing algorithm works the same way on the various architectures,
-which have different numbers and types of zones. If we wanted to get
-fancy, we could assign different weights to free pages in different
-zones in the future.
-
-Note that if the size of the regular zone is huge compared to dma zone,
-it becomes less significant to consider the free dma pages while
-deciding whether to balance the regular zone. The first solution
-becomes more attractive then.
-
-The appended patch implements the second solution. It also "fixes" two
-problems: first, kswapd is woken up as in 2.2 on low memory conditions
-for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
-so as to give a fighting chance for replace_with_highmem() to get a
-HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
-fall back into regular zone. This also makes sure that HIGHMEM pages
-are not leaked (for example, in situations where a HIGHMEM page is in
-the swapcache but is not being used by anyone)
-
-kswapd also needs to know about the zones it should balance. kswapd is
-primarily needed in a situation where balancing can not be done,
-probably because all allocation requests are coming from intr context
-and all process contexts are sleeping. For 2.3, kswapd does not really
-need to balance the highmem zone, since intr context does not request
-highmem pages. kswapd looks at the zone_wake_kswapd field in the zone
-structure to decide whether a zone needs balancing.
-
-Page stealing from process memory and shm is done if stealing the page would
-alleviate memory pressure on any zone in the page's node that has fallen below
-its watermark.
-
-watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
-are per-zone fields, used to determine when a zone needs to be balanced. When
-the number of pages falls below watermark[WMARK_MIN], the hysteric field
-low_on_memory gets set. This stays set till the number of free pages becomes
-watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
-try to free some pages in the zone (providing GFP_WAIT is set in the request).
-Orthogonal to this, is the decision to poke kswapd to free some zone pages.
-That decision is not hysteresis based, and is done when the number of free
-pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
-
-
-(Good) Ideas that I have heard:
-
-1. Dynamic experience should influence balancing: number of failed requests
-   for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
-2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
-   dma pages. (lkd@tantalophile.demon.co.uk)
diff --git a/Documentation/vm/bootmem.rst b/Documentation/vm/bootmem.rst
deleted file mode 100644
index eb2b31eedfa1..000000000000
--- a/Documentation/vm/bootmem.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===========
-Boot Memory
-===========
diff --git a/Documentation/vm/damon/api.rst b/Documentation/vm/damon/api.rst
deleted file mode 100644
index 08f34df45523..000000000000
--- a/Documentation/vm/damon/api.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=============
-API Reference
-=============
-
-Kernel space programs can use every feature of DAMON using below APIs.  All you
-need to do is including ``damon.h``, which is located in ``include/linux/`` of
-the source tree.
-
-Structures
-==========
-
-.. kernel-doc:: include/linux/damon.h
-
-
-Functions
-=========
-
-.. kernel-doc:: mm/damon/core.c
diff --git a/Documentation/vm/damon/design.rst b/Documentation/vm/damon/design.rst
deleted file mode 100644
index 0cff6fac6b7e..000000000000
--- a/Documentation/vm/damon/design.rst
+++ /dev/null
@@ -1,176 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======
-Design
-======
-
-Configurable Layers
-===================
-
-DAMON provides data access monitoring functionality while making the accuracy
-and the overhead controllable.  The fundamental access monitorings require
-primitives that dependent on and optimized for the target address space.  On
-the other hand, the accuracy and overhead tradeoff mechanism, which is the core
-of DAMON, is in the pure logic space.  DAMON separates the two parts in
-different layers and defines its interface to allow various low level
-primitives implementations configurable with the core logic.  We call the low
-level primitives implementations monitoring operations.
-
-Due to this separated design and the configurable interface, users can extend
-DAMON for any address space by configuring the core logics with appropriate
-monitoring operations.  If appropriate one is not provided, users can implement
-the operations on their own.
-
-For example, physical memory, virtual memory, swap space, those for specific
-processes, NUMA nodes, files, and backing memory devices would be supportable.
-Also, if some architectures or devices support special optimized access check
-primitives, those will be easily configurable.
-
-
-Reference Implementations of Address Space Specific Monitoring Operations
-=========================================================================
-
-The monitoring operations are defined in two parts:
-
-1. Identification of the monitoring target address range for the address space.
-2. Access check of specific address range in the target space.
-
-DAMON currently provides the implementations of the operations for the physical
-and virtual address spaces. Below two subsections describe how those work.
-
-
-VMA-based Target Address Range Construction
--------------------------------------------
-
-This is only for the virtual address space monitoring operations
-implementation.  That for the physical address space simply asks users to
-manually set the monitoring target address ranges.
-
-Only small parts in the super-huge virtual address space of the processes are
-mapped to the physical memory and accessed.  Thus, tracking the unmapped
-address regions is just wasteful.  However, because DAMON can deal with some
-level of noise using the adaptive regions adjustment mechanism, tracking every
-mapping is not strictly required but could even incur a high overhead in some
-cases.  That said, too huge unmapped areas inside the monitoring target should
-be removed to not take the time for the adaptive mechanism.
-
-For the reason, this implementation converts the complex mappings to three
-distinct regions that cover every mapped area of the address space.  The two
-gaps between the three regions are the two biggest unmapped areas in the given
-address space.  The two biggest unmapped areas would be the gap between the
-heap and the uppermost mmap()-ed region, and the gap between the lowermost
-mmap()-ed region and the stack in most of the cases.  Because these gaps are
-exceptionally huge in usual address spaces, excluding these will be sufficient
-to make a reasonable trade-off.  Below shows this in detail::
-
-    <heap>
-    <BIG UNMAPPED REGION 1>
-    <uppermost mmap()-ed region>
-    (small mmap()-ed regions and munmap()-ed regions)
-    <lowermost mmap()-ed region>
-    <BIG UNMAPPED REGION 2>
-    <stack>
-
-
-PTE Accessed-bit Based Access Check
------------------------------------
-
-Both of the implementations for physical and virtual address spaces use PTE
-Accessed-bit for basic access checks.  Only one difference is the way of
-finding the relevant PTE Accessed bit(s) from the address.  While the
-implementation for the virtual address walks the page table for the target task
-of the address, the implementation for the physical address walks every page
-table having a mapping to the address.  In this way, the implementations find
-and clear the bit(s) for next sampling target address and checks whether the
-bit(s) set again after one sampling period.  This could disturb other kernel
-subsystems using the Accessed bits, namely Idle page tracking and the reclaim
-logic.  DAMON does nothing to avoid disturbing Idle page tracking, so handling
-the interference is the responsibility of sysadmins.  However, it solves the
-conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags,
-as Idle page tracking does.
-
-
-Address Space Independent Core Mechanisms
-=========================================
-
-Below four sections describe each of the DAMON core mechanisms and the five
-monitoring attributes, ``sampling interval``, ``aggregation interval``,
-``update interval``, ``minimum number of regions``, and ``maximum number of
-regions``.
-
-
-Access Frequency Monitoring
----------------------------
-
-The output of DAMON says what pages are how frequently accessed for a given
-duration.  The resolution of the access frequency is controlled by setting
-``sampling interval`` and ``aggregation interval``.  In detail, DAMON checks
-access to each page per ``sampling interval`` and aggregates the results.  In
-other words, counts the number of the accesses to each page.  After each
-``aggregation interval`` passes, DAMON calls callback functions that previously
-registered by users so that users can read the aggregated results and then
-clears the results.  This can be described in below simple pseudo-code::
-
-    while monitoring_on:
-        for page in monitoring_target:
-            if accessed(page):
-                nr_accesses[page] += 1
-        if time() % aggregation_interval == 0:
-            for callback in user_registered_callbacks:
-                callback(monitoring_target, nr_accesses)
-            for page in monitoring_target:
-                nr_accesses[page] = 0
-        sleep(sampling interval)
-
-The monitoring overhead of this mechanism will arbitrarily increase as the
-size of the target workload grows.
-
-
-Region Based Sampling
----------------------
-
-To avoid the unbounded increase of the overhead, DAMON groups adjacent pages
-that assumed to have the same access frequencies into a region.  As long as the
-assumption (pages in a region have the same access frequencies) is kept, only
-one page in the region is required to be checked.  Thus, for each ``sampling
-interval``, DAMON randomly picks one page in each region, waits for one
-``sampling interval``, checks whether the page is accessed meanwhile, and
-increases the access frequency of the region if so.  Therefore, the monitoring
-overhead is controllable by setting the number of regions.  DAMON allows users
-to set the minimum and the maximum number of regions for the trade-off.
-
-This scheme, however, cannot preserve the quality of the output if the
-assumption is not guaranteed.
-
-
-Adaptive Regions Adjustment
----------------------------
-
-Even somehow the initial monitoring target regions are well constructed to
-fulfill the assumption (pages in same region have similar access frequencies),
-the data access pattern can be dynamically changed.  This will result in low
-monitoring quality.  To keep the assumption as much as possible, DAMON
-adaptively merges and splits each region based on their access frequency.
-
-For each ``aggregation interval``, it compares the access frequencies of
-adjacent regions and merges those if the frequency difference is small.  Then,
-after it reports and clears the aggregated access frequency of each region, it
-splits each region into two or three regions if the total number of regions
-will not exceed the user-specified maximum number of regions after the split.
-
-In this way, DAMON provides its best-effort quality and minimal overhead while
-keeping the bounds users set for their trade-off.
-
-
-Dynamic Target Space Updates Handling
--------------------------------------
-
-The monitoring target address range could dynamically changed.  For example,
-virtual memory could be dynamically mapped and unmapped.  Physical memory could
-be hot-plugged.
-
-As the changes could be quite frequent in some cases, DAMON allows the
-monitoring operations to check dynamic changes including memory mapping changes
-and applies it to monitoring operations-related data structures such as the
-abstracted monitoring target memory area only for each of a user-specified time
-interval (``update interval``).
diff --git a/Documentation/vm/damon/faq.rst b/Documentation/vm/damon/faq.rst
deleted file mode 100644
index dde7e2414ee6..000000000000
--- a/Documentation/vm/damon/faq.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-Frequently Asked Questions
-==========================
-
-Why a new subsystem, instead of extending perf or other user space tools?
-=========================================================================
-
-First, because it needs to be lightweight as much as possible so that it can be
-used online, any unnecessary overhead such as kernel - user space context
-switching cost should be avoided.  Second, DAMON aims to be used by other
-programs including the kernel.  Therefore, having a dependency on specific
-tools like perf is not desirable.  These are the two biggest reasons why DAMON
-is implemented in the kernel space.
-
-
-Can 'idle pages tracking' or 'perf mem' substitute DAMON?
-=========================================================
-
-Idle page tracking is a low level primitive for access check of the physical
-address space.  'perf mem' is similar, though it can use sampling to minimize
-the overhead.  On the other hand, DAMON is a higher-level framework for the
-monitoring of various address spaces.  It is focused on memory management
-optimization and provides sophisticated accuracy/overhead handling mechanisms.
-Therefore, 'idle pages tracking' and 'perf mem' could provide a subset of
-DAMON's output, but cannot substitute DAMON.
-
-
-Does DAMON support virtual memory only?
-=======================================
-
-No.  The core of the DAMON is address space independent.  The address space
-specific monitoring operations including monitoring target regions
-constructions and actual access checks can be implemented and configured on the
-DAMON core by the users.  In this way, DAMON users can monitor any address
-space with any access check technique.
-
-Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
-implementations of the address space dependent functions for the virtual memory
-and the physical memory by default, for a reference and convenient use.
-
-
-Can I simply monitor page granularity?
-======================================
-
-Yes.  You can do so by setting the ``min_nr_regions`` attribute higher than the
-working set size divided by the page size.  Because the monitoring target
-regions size is forced to be ``>=page size``, the region split will make no
-effect.
diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst
deleted file mode 100644
index 48c0bbff98b2..000000000000
--- a/Documentation/vm/damon/index.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-DAMON: Data Access MONitor
-==========================
-
-DAMON is a data access monitoring framework subsystem for the Linux kernel.
-The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it
-
- - *accurate* (the monitoring output is useful enough for DRAM level memory
-   management; It might not appropriate for CPU Cache levels, though),
- - *light-weight* (the monitoring overhead is low enough to be applied online),
-   and
- - *scalable* (the upper-bound of the overhead is in constant range regardless
-   of the size of target workloads).
-
-Using this framework, therefore, the kernel's memory management mechanisms can
-make advanced decisions.  Experimental memory management optimization works
-that incurring high data accesses monitoring overhead could implemented again.
-In user space, meanwhile, users who have some special workloads can write
-personalized applications for better understanding and optimizations of their
-workloads and systems.
-
-.. toctree::
-   :maxdepth: 2
-
-   faq
-   design
-   api
diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst
deleted file mode 100644
index 8c05e62d8b2b..000000000000
--- a/Documentation/vm/free_page_reporting.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-.. _free_page_reporting:
-
-=====================
-Free Page Reporting
-=====================
-
-Free page reporting is an API by which a device can register to receive
-lists of pages that are currently unused by the system. This is useful in
-the case of virtualization where a guest is then able to use this data to
-notify the hypervisor that it is no longer using certain pages in memory.
-
-For the driver, typically a balloon driver, to use of this functionality
-it will allocate and initialize a page_reporting_dev_info structure. The
-field within the structure it will populate is the "report" function
-pointer used to process the scatterlist. It must also guarantee that it can
-handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
-call to the function. A call to page_reporting_register will register the
-page reporting interface with the reporting framework assuming no other
-page reporting devices are already registered.
-
-Once registered the page reporting API will begin reporting batches of
-pages to the driver. The API will start reporting pages 2 seconds after
-the interface is registered and will continue to do so 2 seconds after any
-page of a sufficiently high order is freed.
-
-Pages reported will be stored in the scatterlist passed to the reporting
-function with the final entry having the end bit set in entry nent - 1.
-While pages are being processed by the report function they will not be
-accessible to the allocator. Once the report function has been completed
-the pages will be returned to the free area from which they were obtained.
-
-Prior to removing a driver that is making use of free page reporting it
-is necessary to call page_reporting_unregister to have the
-page_reporting_dev_info structure that is currently in use by free page
-reporting removed. Doing this will prevent further reports from being
-issued via the interface. If another driver or the same driver is
-registered it is possible for it to resume where the previous driver had
-left off in terms of reporting free pages.
-
-Alexander Duyck, Dec 04, 2019
diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst
deleted file mode 100644
index feecc5e24477..000000000000
--- a/Documentation/vm/frontswap.rst
+++ /dev/null
@@ -1,266 +0,0 @@
-.. _frontswap:
-
-=========
-Frontswap
-=========
-
-Frontswap provides a "transcendent memory" interface for swap pages.
-In some environments, dramatic performance savings may be obtained because
-swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
-
-.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
-
-Frontswap is so named because it can be thought of as the opposite of
-a "backing" store for a swap device.  The storage is assumed to be
-a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
-to the requirements of transcendent memory (such as Xen's "tmem", or
-in-kernel compressed memory, aka "zcache", or future RAM-like devices);
-this pseudo-RAM device is not directly accessible or addressable by the
-kernel and is of unknown and possibly time-varying size.  The driver
-links itself to frontswap by calling frontswap_register_ops to set the
-frontswap_ops funcs appropriately and the functions it provides must
-conform to certain policies as follows:
-
-An "init" prepares the device to receive frontswap pages associated
-with the specified swap device number (aka "type").  A "store" will
-copy the page to transcendent memory and associate it with the type and
-offset associated with the page. A "load" will copy the page, if found,
-from transcendent memory into kernel memory, but will NOT remove the page
-from transcendent memory.  An "invalidate_page" will remove the page
-from transcendent memory and an "invalidate_area" will remove ALL pages
-associated with the swap type (e.g., like swapoff) and notify the "device"
-to refuse further stores with that swap type.
-
-Once a page is successfully stored, a matching load on the page will normally
-succeed.  So when the kernel finds itself in a situation where it needs
-to swap out a page, it first attempts to use frontswap.  If the store returns
-success, the data has been successfully saved to transcendent memory and
-a disk write and, if the data is later read back, a disk read are avoided.
-If a store returns failure, transcendent memory has rejected the data, and the
-page can be written to swap as usual.
-
-Note that if a page is stored and the page already exists in transcendent memory
-(a "duplicate" store), either the store succeeds and the data is overwritten,
-or the store fails AND the page is invalidated.  This ensures stale data may
-never be obtained from frontswap.
-
-If properly configured, monitoring of frontswap is done via debugfs in
-the `/sys/kernel/debug/frontswap` directory.  The effectiveness of
-frontswap can be measured (across all swap devices) with:
-
-``failed_stores``
-	how many store attempts have failed
-
-``loads``
-	how many loads were attempted (all should succeed)
-
-``succ_stores``
-	how many store attempts have succeeded
-
-``invalidates``
-	how many invalidates were attempted
-
-A backend implementation may provide additional metrics.
-
-FAQ
-===
-
-* Where's the value?
-
-When a workload starts swapping, performance falls through the floor.
-Frontswap significantly increases performance in many such workloads by
-providing a clean, dynamic interface to read and write swap pages to
-"transcendent memory" that is otherwise not directly addressable to the kernel.
-This interface is ideal when data is transformed to a different form
-and size (such as with compression) or secretly moved (as might be
-useful for write-balancing for some RAM-like devices).  Swap pages (and
-evicted page-cache pages) are a great use for this kind of slower-than-RAM-
-but-much-faster-than-disk "pseudo-RAM device".
-
-Frontswap with a fairly small impact on the kernel,
-provides a huge amount of flexibility for more dynamic, flexible RAM
-utilization in various system configurations:
-
-In the single kernel case, aka "zcache", pages are compressed and
-stored in local memory, thus increasing the total anonymous pages
-that can be safely kept in RAM.  Zcache essentially trades off CPU
-cycles used in compression/decompression for better memory utilization.
-Benchmarks have shown little or no impact when memory pressure is
-low while providing a significant performance improvement (25%+)
-on some workloads under high memory pressure.
-
-"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory
-support for clustered systems.  Frontswap pages are locally compressed
-as in zcache, but then "remotified" to another system's RAM.  This
-allows RAM to be dynamically load-balanced back-and-forth as needed,
-i.e. when system A is overcommitted, it can swap to system B, and
-vice versa.  RAMster can also be configured as a memory server so
-many servers in a cluster can swap, dynamically as needed, to a single
-server configured with a large amount of RAM... without pre-configuring
-how much of the RAM is available for each of the clients!
-
-In the virtual case, the whole point of virtualization is to statistically
-multiplex physical resources across the varying demands of multiple
-virtual machines.  This is really hard to do with RAM and efforts to do
-it well with no kernel changes have essentially failed (except in some
-well-publicized special-case workloads).
-Specifically, the Xen Transcendent Memory backend allows otherwise
-"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
-virtual machines, but the pages can be compressed and deduplicated to
-optimize RAM utilization.  And when guest OS's are induced to surrender
-underutilized RAM (e.g. with "selfballooning"), sudden unexpected
-memory pressure may result in swapping; frontswap allows those pages
-to be swapped to and from hypervisor RAM (if overall host system memory
-conditions allow), thus mitigating the potentially awful performance impact
-of unplanned swapping.
-
-A KVM implementation is underway and has been RFC'ed to lkml.  And,
-using frontswap, investigation is also underway on the use of NVM as
-a memory extension technology.
-
-* Sure there may be performance advantages in some situations, but
-  what's the space/time overhead of frontswap?
-
-If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
-nothingness and the only overhead is a few extra bytes per swapon'ed
-swap device.  If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
-registers, there is one extra global variable compared to zero for
-every swap page read or written.  If CONFIG_FRONTSWAP is enabled
-AND a frontswap backend registers AND the backend fails every "store"
-request (i.e. provides no memory despite claiming it might),
-CPU overhead is still negligible -- and since every frontswap fail
-precedes a swap page write-to-disk, the system is highly likely
-to be I/O bound and using a small fraction of a percent of a CPU
-will be irrelevant anyway.
-
-As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
-registers, one bit is allocated for every swap page for every swap
-device that is swapon'd.  This is added to the EIGHT bits (which
-was sixteen until about 2.6.34) that the kernel already allocates
-for every swap page for every swap device that is swapon'd.  (Hugh
-Dickins has observed that frontswap could probably steal one of
-the existing eight bits, but let's worry about that minor optimization
-later.)  For very large swap disks (which are rare) on a standard
-4K pagesize, this is 1MB per 32GB swap.
-
-When swap pages are stored in transcendent memory instead of written
-out to disk, there is a side effect that this may create more memory
-pressure that can potentially outweigh the other advantages.  A
-backend, such as zcache, must implement policies to carefully (but
-dynamically) manage memory limits to ensure this doesn't happen.
-
-* OK, how about a quick overview of what this frontswap patch does
-  in terms that a kernel hacker can grok?
-
-Let's assume that a frontswap "backend" has registered during
-kernel initialization; this registration indicates that this
-frontswap backend has access to some "memory" that is not directly
-accessible by the kernel.  Exactly how much memory it provides is
-entirely dynamic and random.
-
-Whenever a swap-device is swapon'd frontswap_init() is called,
-passing the swap device number (aka "type") as a parameter.
-This notifies frontswap to expect attempts to "store" swap pages
-associated with that number.
-
-Whenever the swap subsystem is readying a page to write to a swap
-device (c.f swap_writepage()), frontswap_store is called.  Frontswap
-consults with the frontswap backend and if the backend says it does NOT
-have room, frontswap_store returns -1 and the kernel swaps the page
-to the swap device as normal.  Note that the response from the frontswap
-backend is unpredictable to the kernel; it may choose to never accept a
-page, it could accept every ninth page, or it might accept every
-page.  But if the backend does accept a page, the data from the page
-has already been copied and associated with the type and offset,
-and the backend guarantees the persistence of the data.  In this case,
-frontswap sets a bit in the "frontswap_map" for the swap device
-corresponding to the page offset on the swap device to which it would
-otherwise have written the data.
-
-When the swap subsystem needs to swap-in a page (swap_readpage()),
-it first calls frontswap_load() which checks the frontswap_map to
-see if the page was earlier accepted by the frontswap backend.  If
-it was, the page of data is filled from the frontswap backend and
-the swap-in is complete.  If not, the normal swap-in code is
-executed to obtain the page of data from the real swap device.
-
-So every time the frontswap backend accepts a page, a swap device read
-and (potentially) a swap device write are replaced by a "frontswap backend
-store" and (possibly) a "frontswap backend loads", which are presumably much
-faster.
-
-* Can't frontswap be configured as a "special" swap device that is
-  just higher priority than any real swap device (e.g. like zswap,
-  or maybe swap-over-nbd/NFS)?
-
-No.  First, the existing swap subsystem doesn't allow for any kind of
-swap hierarchy.  Perhaps it could be rewritten to accommodate a hierarchy,
-but this would require fairly drastic changes.  Even if it were
-rewritten, the existing swap subsystem uses the block I/O layer which
-assumes a swap device is fixed size and any page in it is linearly
-addressable.  Frontswap barely touches the existing swap subsystem,
-and works around the constraints of the block I/O subsystem to provide
-a great deal of flexibility and dynamicity.
-
-For example, the acceptance of any swap page by the frontswap backend is
-entirely unpredictable. This is critical to the definition of frontswap
-backends because it grants completely dynamic discretion to the
-backend.  In zcache, one cannot know a priori how compressible a page is.
-"Poorly" compressible pages can be rejected, and "poorly" can itself be
-defined dynamically depending on current memory constraints.
-
-Further, frontswap is entirely synchronous whereas a real swap
-device is, by definition, asynchronous and uses block I/O.  The
-block I/O layer is not only unnecessary, but may perform "optimizations"
-that are inappropriate for a RAM-oriented device including delaying
-the write of some pages for a significant amount of time.  Synchrony is
-required to ensure the dynamicity of the backend and to avoid thorny race
-conditions that would unnecessarily and greatly complicate frontswap
-and/or the block I/O subsystem.  That said, only the initial "store"
-and "load" operations need be synchronous.  A separate asynchronous thread
-is free to manipulate the pages stored by frontswap.  For example,
-the "remotification" thread in RAMster uses standard asynchronous
-kernel sockets to move compressed frontswap pages to a remote machine.
-Similarly, a KVM guest-side implementation could do in-guest compression
-and use "batched" hypercalls.
-
-In a virtualized environment, the dynamicity allows the hypervisor
-(or host OS) to do "intelligent overcommit".  For example, it can
-choose to accept pages only until host-swapping might be imminent,
-then force guests to do their own swapping.
-
-There is a downside to the transcendent memory specifications for
-frontswap:  Since any "store" might fail, there must always be a real
-slot on a real swap device to swap the page.  Thus frontswap must be
-implemented as a "shadow" to every swapon'd device with the potential
-capability of holding every page that the swap device might have held
-and the possibility that it might hold no pages at all.  This means
-that frontswap cannot contain more pages than the total of swapon'd
-swap devices.  For example, if NO swap device is configured on some
-installation, frontswap is useless.  Swapless portable devices
-can still use frontswap but a backend for such devices must configure
-some kind of "ghost" swap device and ensure that it is never used.
-
-* Why this weird definition about "duplicate stores"?  If a page
-  has been previously successfully stored, can't it always be
-  successfully overwritten?
-
-Nearly always it can, but no, sometimes it cannot.  Consider an example
-where data is compressed and the original 4K page has been compressed
-to 1K.  Now an attempt is made to overwrite the page with data that
-is non-compressible and so would take the entire 4K.  But the backend
-has no more space.  In this case, the store must be rejected.  Whenever
-frontswap rejects a store that would overwrite, it also must invalidate
-the old data and ensure that it is no longer accessible.  Since the
-swap subsystem then writes the new data to the read swap device,
-this is the correct course of action to ensure coherency.
-
-* Why does the frontswap patch create the new include file swapfile.h?
-
-The frontswap code depends on some swap-subsystem-internal data
-structures that have, over the years, moved back and forth between
-static and global.  This seemed a reasonable compromise:  Define
-them as global but declare them in a new include file that isn't
-included by the large number of source files that include swap.h.
-
-Dan Magenheimer, last updated April 9, 2012
diff --git a/Documentation/vm/highmem.rst b/Documentation/vm/highmem.rst
deleted file mode 100644
index c9887f241c6c..000000000000
--- a/Documentation/vm/highmem.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _highmem:
-
-====================
-High Memory Handling
-====================
-
-By: Peter Zijlstra <a.p.zijlstra@chello.nl>
-
-.. contents:: :local:
-
-What Is High Memory?
-====================
-
-High memory (highmem) is used when the size of physical memory approaches or
-exceeds the maximum size of virtual memory.  At that point it becomes
-impossible for the kernel to keep all of the available physical memory mapped
-at all times.  This means the kernel needs to start using temporary mappings of
-the pieces of physical memory that it wants to access.
-
-The part of (physical) memory not covered by a permanent mapping is what we
-refer to as 'highmem'.  There are various architecture dependent constraints on
-where exactly that border lies.
-
-In the i386 arch, for example, we choose to map the kernel into every process's
-VM space so that we don't have to pay the full TLB invalidation costs for
-kernel entry/exit.  This means the available virtual memory space (4GiB on
-i386) has to be divided between user and kernel space.
-
-The traditional split for architectures using this approach is 3:1, 3GiB for
-userspace and the top 1GiB for kernel space::
-
-		+--------+ 0xffffffff
-		| Kernel |
-		+--------+ 0xc0000000
-		|        |
-		| User   |
-		|        |
-		+--------+ 0x00000000
-
-This means that the kernel can at most map 1GiB of physical memory at any one
-time, but because we need virtual address space for other things - including
-temporary maps to access the rest of the physical memory - the actual direct
-map will typically be less (usually around ~896MiB).
-
-Other architectures that have mm context tagged TLBs can have separate kernel
-and user maps.  Some hardware (like some ARMs), however, have limited virtual
-space when they use mm context tags.
-
-
-Temporary Virtual Mappings
-==========================
-
-The kernel contains several ways of creating temporary mappings. The following
-list shows them in order of preference of use.
-
-* kmap_local_page().  This function is used to require short term mappings.
-  It can be invoked from any context (including interrupts) but the mappings
-  can only be used in the context which acquired them.
-
-  This function should be preferred, where feasible, over all the others.
-
-  These mappings are thread-local and CPU-local, meaning that the mapping
-  can only be accessed from within this thread and the thread is bound the
-  CPU while the mapping is active. Even if the thread is preempted (since
-  preemption is never disabled by the function) the CPU can not be
-  unplugged from the system via CPU-hotplug until the mapping is disposed.
-
-  It's valid to take pagefaults in a local kmap region, unless the context
-  in which the local mapping is acquired does not allow it for other reasons.
-
-  kmap_local_page() always returns a valid virtual address and it is assumed
-  that kunmap_local() will never fail.
-
-  Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
-  extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
-  because the map implementation is stack based. See kmap_local_page() kdocs
-  (included in the "Functions" section) for details on how to manage nested
-  mappings.
-
-* kmap_atomic().  This permits a very short duration mapping of a single
-  page.  Since the mapping is restricted to the CPU that issued it, it
-  performs well, but the issuing task is therefore required to stay on that
-  CPU until it has finished, lest some other task displace its mappings.
-
-  kmap_atomic() may also be used by interrupt contexts, since it does not
-  sleep and the callers too may not sleep until after kunmap_atomic() is
-  called.
-
-  Each call of kmap_atomic() in the kernel creates a non-preemptible section
-  and disable pagefaults. This could be a source of unwanted latency. Therefore
-  users should prefer kmap_local_page() instead of kmap_atomic().
-
-  It is assumed that k[un]map_atomic() won't fail.
-
-* kmap().  This should be used to make short duration mapping of a single
-  page with no restrictions on preemption or migration. It comes with an
-  overhead as mapping space is restricted and protected by a global lock
-  for synchronization. When mapping is no longer needed, the address that
-  the page was mapped to must be released with kunmap().
-
-  Mapping changes must be propagated across all the CPUs. kmap() also
-  requires global TLB invalidation when the kmap's pool wraps and it might
-  block when the mapping space is fully utilized until a slot becomes
-  available. Therefore, kmap() is only callable from preemptible context.
-
-  All the above work is necessary if a mapping must last for a relatively
-  long time but the bulk of high-memory mappings in the kernel are
-  short-lived and only used in one place. This means that the cost of
-  kmap() is mostly wasted in such cases. kmap() was not intended for long
-  term mappings but it has morphed in that direction and its use is
-  strongly discouraged in newer code and the set of the preceding functions
-  should be preferred.
-
-  On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have
-  no real work to do because a 64-bit address space is more than sufficient to
-  address all the physical memory whose pages are permanently mapped.
-
-* vmap().  This can be used to make a long duration mapping of multiple
-  physical pages into a contiguous virtual space.  It needs global
-  synchronization to unmap.
-
-
-Cost of Temporary Mappings
-==========================
-
-The cost of creating temporary mappings can be quite high.  The arch has to
-manipulate the kernel's page tables, the data TLB and/or the MMU's registers.
-
-If CONFIG_HIGHMEM is not set, then the kernel will try and create a mapping
-simply with a bit of arithmetic that will convert the page struct address into
-a pointer to the page contents rather than juggling mappings about.  In such a
-case, the unmap operation may be a null operation.
-
-If CONFIG_MMU is not set, then there can be no temporary mappings and no
-highmem.  In such a case, the arithmetic approach will also be used.
-
-
-i386 PAE
-========
-
-The i386 arch, under some circumstances, will permit you to stick up to 64GiB
-of RAM into your 32-bit machine.  This has a number of consequences:
-
-* Linux needs a page-frame structure for each page in the system and the
-  pageframes need to live in the permanent mapping, which means:
-
-* you can have 896M/sizeof(struct page) page-frames at most; with struct
-  page being 32-bytes that would end up being something in the order of 112G
-  worth of pages; the kernel, however, needs to store more than just
-  page-frames in that memory...
-
-* PAE makes your page tables larger - which slows the system down as more
-  data has to be accessed to traverse in TLB fills and the like.  One
-  advantage is that PAE has more PTE bits and can provide advanced features
-  like NX and PAT.
-
-The general recommendation is that you don't use more than 8GiB on a 32-bit
-machine - although more might work for you and your workload, you're pretty
-much on your own - don't expect kernel developers to really care much if things
-come apart.
-
-
-Functions
-=========
-
-.. kernel-doc:: include/linux/highmem.h
-.. kernel-doc:: include/linux/highmem-internal.h
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
deleted file mode 100644
index f2a59ed82ed3..000000000000
--- a/Documentation/vm/hmm.rst
+++ /dev/null
@@ -1,452 +0,0 @@
-.. _hmm:
-
-=====================================
-Heterogeneous Memory Management (HMM)
-=====================================
-
-Provide infrastructure and helpers to integrate non-conventional memory (device
-memory like GPU on board memory) into regular kernel path, with the cornerstone
-of this being specialized struct page for such memory (see sections 5 to 7 of
-this document).
-
-HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program addresses coherently with
-the CPU meaning that any valid pointer on the CPU is also a valid pointer
-for the device. This is becoming mandatory to simplify the use of advanced
-heterogeneous computing where GPU, DSP, or FPGA are used to perform various
-computations on behalf of a process.
-
-This document is divided as follows: in the first section I expose the problems
-related to using device specific memory allocators. In the second section, I
-expose the hardware limitations that are inherent to many platforms. The third
-section gives an overview of the HMM design. The fourth section explains how
-CPU page-table mirroring works and the purpose of HMM in this context. The
-fifth section deals with how device memory is represented inside the kernel.
-Finally, the last section presents a new migration helper that allows
-leveraging the device DMA engine.
-
-.. contents:: :local:
-
-Problems of using a device specific memory allocator
-====================================================
-
-Devices with a large amount of on board memory (several gigabytes) like GPUs
-have historically managed their memory through dedicated driver specific APIs.
-This creates a disconnect between memory allocated and managed by a device
-driver and regular application memory (private anonymous, shared memory, or
-regular file backed memory). From here on I will refer to this aspect as split
-address space. I use shared address space to refer to the opposite situation:
-i.e., one in which any application memory region can be used by a device
-transparently.
-
-Split address space happens because devices can only access memory allocated
-through a device specific API. This implies that all memory objects in a program
-are not equal from the device point of view which complicates large programs
-that rely on a wide set of libraries.
-
-Concretely, this means that code that wants to leverage devices like GPUs needs
-to copy objects between generically allocated memory (malloc, mmap private, mmap
-share) and memory allocated through the device driver API (this still ends up
-with an mmap but of the device file).
-
-For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
-for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
-complex data set needs to re-map all the pointer relations between each of its
-elements. This is error prone and programs get harder to debug because of the
-duplicate data set and addresses.
-
-Split address space also means that libraries cannot transparently use data
-they are getting from the core program or another library and thus each library
-might have to duplicate its input data set using the device specific memory
-allocator. Large projects suffer from this and waste resources because of the
-various memory copies.
-
-Duplicating each library API to accept as input or output memory allocated by
-each device specific allocator is not a viable option. It would lead to a
-combinatorial explosion in the library entry points.
-
-Finally, with the advance of high level language constructs (in C++ but in
-other languages too) it is now possible for the compiler to leverage GPUs and
-other devices without programmer knowledge. Some compiler identified patterns
-are only do-able with a shared address space. It is also more reasonable to use
-a shared address space for all other patterns.
-
-
-I/O bus, device memory characteristics
-======================================
-
-I/O buses cripple shared address spaces due to a few limitations. Most I/O
-buses only allow basic memory access from device to main memory; even cache
-coherency is often optional. Access to device memory from a CPU is even more
-limited. More often than not, it is not cache coherent.
-
-If we only consider the PCIE bus, then a device can access main memory (often
-through an IOMMU) and be cache coherent with the CPUs. However, it only allows
-a limited set of atomic operations from the device on main memory. This is worse
-in the other direction: the CPU can only access a limited range of the device
-memory and cannot perform atomic operations on it. Thus device memory cannot
-be considered the same as regular memory from the kernel point of view.
-
-Another crippling factor is the limited bandwidth (~32GBytes/s with PCIE 4.0
-and 16 lanes). This is 33 times less than the fastest GPU memory (1 TBytes/s).
-The final limitation is latency. Access to main memory from the device has an
-order of magnitude higher latency than when the device accesses its own memory.
-
-Some platforms are developing new I/O buses or additions/modifications to PCIE
-to address some of these limitations (OpenCAPI, CCIX). They mainly allow
-two-way cache coherency between CPU and device and allow all atomic operations the
-architecture supports. Sadly, not all platforms are following this trend and
-some major architectures are left without hardware solutions to these problems.
-
-So for shared address space to make sense, not only must we allow devices to
-access any memory but we must also permit any memory to be migrated to device
-memory while the device is using it (blocking CPU access while it happens).
-
-
-Shared address space and migration
-==================================
-
-HMM intends to provide two main features. The first one is to share the address
-space by duplicating the CPU page table in the device page table so the same
-address points to the same physical memory for any valid main memory address in
-the process address space.
-
-To achieve this, HMM offers a set of helpers to populate the device page table
-while keeping track of CPU page table updates. Device page table updates are
-not as easy as CPU page table updates. To update the device page table, you must
-allocate a buffer (or use a pool of pre-allocated buffers) and write GPU
-specific commands in it to perform the update (unmap, cache invalidations, and
-flush, ...). This cannot be done through common code for all devices. Hence
-why HMM provides helpers to factor out everything that can be while leaving the
-hardware specific details to the device driver.
-
-The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
-allows allocating a struct page for each page of device memory. Those pages
-are special because the CPU cannot map them. However, they allow migrating
-main memory to device memory using existing migration mechanisms and everything
-looks like a page that is swapped out to disk from the CPU point of view. Using a
-struct page gives the easiest and cleanest integration with existing mm
-mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
-memory for the device memory and second to perform migration. Policy decisions
-of what and when to migrate is left to the device driver.
-
-Note that any CPU access to a device page triggers a page fault and a migration
-back to main memory. For example, when a page backing a given CPU address A is
-migrated from a main memory page to a device page, then any CPU access to
-address A triggers a page fault and initiates a migration back to main memory.
-
-With these two features, HMM not only allows a device to mirror process address
-space and keeps both CPU and device page tables synchronized, but also
-leverages device memory by migrating the part of the data set that is actively being
-used by the device.
-
-
-Address space mirroring implementation and API
-==============================================
-
-Address space mirroring's main objective is to allow duplication of a range of
-CPU page table into a device page table; HMM helps keep both synchronized. A
-device driver that wants to mirror a process address space must start with the
-registration of a mmu_interval_notifier::
-
- int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
-				  struct mm_struct *mm, unsigned long start,
-				  unsigned long length,
-				  const struct mmu_interval_notifier_ops *ops);
-
-During the ops->invalidate() callback the device driver must perform the
-update action to the range (mark range read only, or fully unmap, etc.). The
-device must complete the update before the driver callback returns.
-
-When the device driver wants to populate a range of virtual addresses, it can
-use::
-
-  int hmm_range_fault(struct hmm_range *range);
-
-It will trigger a page fault on missing or read-only entries if write access is
-requested (see below). Page faults use the generic mm page fault code path just
-like a CPU page fault.
-
-Both functions copy CPU page table entries into their pfns array argument. Each
-entry in that array corresponds to an address in the virtual range. HMM
-provides a set of flags to help the driver identify special CPU page table
-entries.
-
-Locking within the sync_cpu_device_pagetables() callback is the most important
-aspect the driver must respect in order to keep things properly synchronized.
-The usage pattern is::
-
- int driver_populate_range(...)
- {
-      struct hmm_range range;
-      ...
-
-      range.notifier = &interval_sub;
-      range.start = ...;
-      range.end = ...;
-      range.hmm_pfns = ...;
-
-      if (!mmget_not_zero(interval_sub->notifier.mm))
-          return -EFAULT;
-
- again:
-      range.notifier_seq = mmu_interval_read_begin(&interval_sub);
-      mmap_read_lock(mm);
-      ret = hmm_range_fault(&range);
-      if (ret) {
-          mmap_read_unlock(mm);
-          if (ret == -EBUSY)
-                 goto again;
-          return ret;
-      }
-      mmap_read_unlock(mm);
-
-      take_lock(driver->update);
-      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
-          release_lock(driver->update);
-          goto again;
-      }
-
-      /* Use pfns array content to update device page table,
-       * under the update lock */
-
-      release_lock(driver->update);
-      return 0;
- }
-
-The driver->update lock is the same lock that the driver takes inside its
-invalidate() callback. That lock must be held before calling
-mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
-update.
-
-Leverage default_flags and pfn_flags_mask
-=========================================
-
-The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
-fault or snapshot policy for the whole range instead of having to set them
-for each entry in the pfns array.
-
-For instance if the device driver wants pages for a range with at least read
-permission, it sets::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = 0;
-
-and calls hmm_range_fault() as described above. This will fill fault all pages
-in the range with at least read permission.
-
-Now let's say the driver wants to do the same except for one page in the range for
-which it wants to have write permission. Now driver set::
-
-    range->default_flags = HMM_PFN_REQ_FAULT;
-    range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
-    range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
-
-With this, HMM will fault in all pages with at least read (i.e., valid) and for the
-address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
-write permission i.e., if the CPU pte does not have write permission set then HMM
-will call handle_mm_fault().
-
-After hmm_range_fault completes the flag bits are set to the current state of
-the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
-writable.
-
-
-Represent and manage device memory from core kernel point of view
-=================================================================
-
-Several different designs were tried to support device memory. The first one
-used a device specific data structure to keep information about migrated memory
-and HMM hooked itself in various places of mm code to handle any access to
-addresses that were backed by device memory. It turns out that this ended up
-replicating most of the fields of struct page and also needed many kernel code
-paths to be updated to understand this new kind of memory.
-
-Most kernel code paths never try to access the memory behind a page
-but only care about struct page contents. Because of this, HMM switched to
-directly using struct page for device memory which left most kernel code paths
-unaware of the difference. We only need to make sure that no one ever tries to
-map those pages from the CPU side.
-
-Migration to and from device memory
-===================================
-
-Because the CPU cannot access device memory directly, the device driver must
-use hardware DMA or device specific load/store instructions to migrate data.
-The migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
-functions are designed to make drivers easier to write and to centralize common
-code across drivers.
-
-Before migrating pages to device private memory, special device private
-``struct page`` need to be created. These will be used as special "swap"
-page table entries so that a CPU process will fault if it tries to access
-a page that has been migrated to device private memory.
-
-These can be allocated and freed with::
-
-    struct resource *res;
-    struct dev_pagemap pagemap;
-
-    res = request_free_mem_region(&iomem_resource, /* number of bytes */,
-                                  "name of driver resource");
-    pagemap.type = MEMORY_DEVICE_PRIVATE;
-    pagemap.range.start = res->start;
-    pagemap.range.end = res->end;
-    pagemap.nr_range = 1;
-    pagemap.ops = &device_devmem_ops;
-    memremap_pages(&pagemap, numa_node_id());
-
-    memunmap_pages(&pagemap);
-    release_mem_region(pagemap.range.start, range_len(&pagemap.range));
-
-There are also devm_request_free_mem_region(), devm_memremap_pages(),
-devm_memunmap_pages(), and devm_release_mem_region() when the resources can
-be tied to a ``struct device``.
-
-The overall migration steps are similar to migrating NUMA pages within system
-memory (see :ref:`Page migration <page_migration>`) but the steps are split
-between device driver specific code and shared common code:
-
-1. ``mmap_read_lock()``
-
-   The device driver has to pass a ``struct vm_area_struct`` to
-   migrate_vma_setup() so the mmap_read_lock() or mmap_write_lock() needs to
-   be held for the duration of the migration.
-
-2. ``migrate_vma_setup(struct migrate_vma *args)``
-
-   The device driver initializes the ``struct migrate_vma`` fields and passes
-   the pointer to migrate_vma_setup(). The ``args->flags`` field is used to
-   filter which source pages should be migrated. For example, setting
-   ``MIGRATE_VMA_SELECT_SYSTEM`` will only migrate system memory and
-   ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` will only migrate pages residing in
-   device private memory. If the latter flag is set, the ``args->pgmap_owner``
-   field is used to identify device private pages owned by the driver. This
-   avoids trying to migrate device private pages residing in other devices.
-   Currently only anonymous private VMA ranges can be migrated to or from
-   system memory and device private memory.
-
-   One of the first steps migrate_vma_setup() does is to invalidate other
-   device's MMUs with the ``mmu_notifier_invalidate_range_start(()`` and
-   ``mmu_notifier_invalidate_range_end()`` calls around the page table
-   walks to fill in the ``args->src`` array with PFNs to be migrated.
-   The ``invalidate_range_start()`` callback is passed a
-   ``struct mmu_notifier_range`` with the ``event`` field set to
-   ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to
-   the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is
-   allows the device driver to skip the invalidation callback and only
-   invalidate device private MMU mappings that are actually migrating.
-   This is explained more in the next section.
-
-   While walking the page tables, a ``pte_none()`` or ``is_zero_pfn()``
-   entry results in a valid "zero" PFN stored in the ``args->src`` array.
-   This lets the driver allocate device private memory and clear it instead
-   of copying a page of zeros. Valid PTE entries to system memory or
-   device private struct pages will be locked with ``lock_page()``, isolated
-   from the LRU (if system memory since device private pages are not on
-   the LRU), unmapped from the process, and a special migration PTE is
-   inserted in place of the original PTE.
-   migrate_vma_setup() also clears the ``args->dst`` array.
-
-3. The device driver allocates destination pages and copies source pages to
-   destination pages.
-
-   The driver checks each ``src`` entry to see if the ``MIGRATE_PFN_MIGRATE``
-   bit is set and skips entries that are not migrating. The device driver
-   can also choose to skip migrating a page by not filling in the ``dst``
-   array for that page.
-
-   The driver then allocates either a device private struct page or a
-   system memory page, locks the page with ``lock_page()``, and fills in the
-   ``dst`` array entry with::
-
-     dst[i] = migrate_pfn(page_to_pfn(dpage));
-
-   Now that the driver knows that this page is being migrated, it can
-   invalidate device private MMU mappings and copy device private memory
-   to system memory or another device private page. The core Linux kernel
-   handles CPU page table invalidations so the device driver only has to
-   invalidate its own MMU mappings.
-
-   The driver can use ``migrate_pfn_to_page(src[i])`` to get the
-   ``struct page`` of the source and either copy the source page to the
-   destination or clear the destination device private memory if the pointer
-   is ``NULL`` meaning the source page was not populated in system memory.
-
-4. ``migrate_vma_pages()``
-
-   This step is where the migration is actually "committed".
-
-   If the source page was a ``pte_none()`` or ``is_zero_pfn()`` page, this
-   is where the newly allocated page is inserted into the CPU's page table.
-   This can fail if a CPU thread faults on the same page. However, the page
-   table is locked and only one of the new pages will be inserted.
-   The device driver will see that the ``MIGRATE_PFN_MIGRATE`` bit is cleared
-   if it loses the race.
-
-   If the source page was locked, isolated, etc. the source ``struct page``
-   information is now copied to destination ``struct page`` finalizing the
-   migration on the CPU side.
-
-5. Device driver updates device MMU page tables for pages still migrating,
-   rolling back pages not migrating.
-
-   If the ``src`` entry still has ``MIGRATE_PFN_MIGRATE`` bit set, the device
-   driver can update the device MMU and set the write enable bit if the
-   ``MIGRATE_PFN_WRITE`` bit is set.
-
-6. ``migrate_vma_finalize()``
-
-   This step replaces the special migration page table entry with the new
-   page's page table entry and releases the reference to the source and
-   destination ``struct page``.
-
-7. ``mmap_read_unlock()``
-
-   The lock can now be released.
-
-Exclusive access memory
-=======================
-
-Some devices have features such as atomic PTE bits that can be used to implement
-atomic access to system memory. To support atomic operations to a shared virtual
-memory page such a device needs access to that page which is exclusive of any
-userspace access from the CPU. The ``make_device_exclusive_range()`` function
-can be used to make a memory range inaccessible from userspace.
-
-This replaces all mappings for pages in the given range with special swap
-entries. Any attempt to access the swap entry results in a fault which is
-resovled by replacing the entry with the original mapping. A driver gets
-notified that the mapping has been changed by MMU notifiers, after which point
-it will no longer have exclusive access to the page. Exclusive access is
-guranteed to last until the driver drops the page lock and page reference, at
-which point any CPU faults on the page may proceed as described.
-
-Memory cgroup (memcg) and rss accounting
-========================================
-
-For now, device memory is accounted as any regular page in rss counters (either
-anonymous if device page is used for anonymous, file if device page is used for
-file backed page, or shmem if device page is used for shared memory). This is a
-deliberate choice to keep existing applications, that might start using device
-memory without knowing about it, running unimpacted.
-
-A drawback is that the OOM killer might kill an application using a lot of
-device memory and not a lot of regular system memory and thus not freeing much
-system memory. We want to gather more real world experience on how applications
-and system react under memory pressure in the presence of device memory before
-deciding to account device memory differently.
-
-
-Same decision was made for memory cgroup. Device memory pages are accounted
-against same memory cgroup a regular page would be accounted to. This does
-simplify migration to and from device memory. This also means that migration
-back from device memory to regular memory cannot fail because it would
-go above memory cgroup limit. We might revisit this choice latter on once we
-get more experience in how device memory is used and its impact on memory
-resource control.
-
-
-Note that device memory can never be pinned by a device driver nor through GUP
-and thus such memory is always free upon process exit. Or when last reference
-is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/vm/hugetlbfs_reserv.rst
deleted file mode 100644
index f143954e0d05..000000000000
--- a/Documentation/vm/hugetlbfs_reserv.rst
+++ /dev/null
@@ -1,596 +0,0 @@
-.. _hugetlbfs_reserve:
-
-=====================
-Hugetlbfs Reservation
-=====================
-
-Overview
-========
-
-Huge pages as described at :ref:`hugetlbpage` are typically
-preallocated for application use.  These huge pages are instantiated in a
-task's address space at page fault time if the VMA indicates huge pages are
-to be used.  If no huge page exists at page fault time, the task is sent
-a SIGBUS and often dies an unhappy death.  Shortly after huge page support
-was added, it was determined that it would be better to detect a shortage
-of huge pages at mmap() time.  The idea is that if there were not enough
-huge pages to cover the mapping, the mmap() would fail.  This was first
-done with a simple check in the code at mmap() time to determine if there
-were enough free huge pages to cover the mapping.  Like most things in the
-kernel, the code has evolved over time.  However, the basic idea was to
-'reserve' huge pages at mmap() time to ensure that huge pages would be
-available for page faults in that mapping.  The description below attempts to
-describe how huge page reserve processing is done in the v4.10 kernel.
-
-
-Audience
-========
-This description is primarily targeted at kernel developers who are modifying
-hugetlbfs code.
-
-
-The Data Structures
-===================
-
-resv_huge_pages
-	This is a global (per-hstate) count of reserved huge pages.  Reserved
-	huge pages are only available to the task which reserved them.
-	Therefore, the number of huge pages generally available is computed
-	as (``free_huge_pages - resv_huge_pages``).
-Reserve Map
-	A reserve map is described by the structure::
-
-		struct resv_map {
-			struct kref refs;
-			spinlock_t lock;
-			struct list_head regions;
-			long adds_in_progress;
-			struct list_head region_cache;
-			long region_cache_count;
-		};
-
-	There is one reserve map for each huge page mapping in the system.
-	The regions list within the resv_map describes the regions within
-	the mapping.  A region is described as::
-
-		struct file_region {
-			struct list_head link;
-			long from;
-			long to;
-		};
-
-	The 'from' and 'to' fields of the file region structure are huge page
-	indices into the mapping.  Depending on the type of mapping, a
-	region in the reserv_map may indicate reservations exist for the
-	range, or reservations do not exist.
-Flags for MAP_PRIVATE Reservations
-	These are stored in the bottom bits of the reservation map pointer.
-
-	``#define HPAGE_RESV_OWNER    (1UL << 0)``
-		Indicates this task is the owner of the reservations
-		associated with the mapping.
-	``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
-		Indicates task originally mapping this range (and creating
-		reserves) has unmapped a page from this task (the child)
-		due to a failed COW.
-Page Flags
-	The PagePrivate page flag is used to indicate that a huge page
-	reservation must be restored when the huge page is freed.  More
-	details will be discussed in the "Freeing huge pages" section.
-
-
-Reservation Map Location (Private or Shared)
-============================================
-
-A huge page mapping or segment is either private or shared.  If private,
-it is typically only available to a single address space (task).  If shared,
-it can be mapped into multiple address spaces (tasks).  The location and
-semantics of the reservation map is significantly different for the two types
-of mappings.  Location differences are:
-
-- For private mappings, the reservation map hangs off the VMA structure.
-  Specifically, vma->vm_private_data.  This reserve map is created at the
-  time the mapping (mmap(MAP_PRIVATE)) is created.
-- For shared mappings, the reservation map hangs off the inode.  Specifically,
-  inode->i_mapping->private_data.  Since shared mappings are always backed
-  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
-  contains a reservation map.  As a result, the reservation map is allocated
-  when the inode is created.
-
-
-Creating Reservations
-=====================
-Reservations are created when a huge page backed shared memory segment is
-created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
-These operations result in a call to the routine hugetlb_reserve_pages()::
-
-	int hugetlb_reserve_pages(struct inode *inode,
-				  long from, long to,
-				  struct vm_area_struct *vma,
-				  vm_flags_t vm_flags)
-
-The first thing hugetlb_reserve_pages() does is check if the NORESERVE
-flag was specified in either the shmget() or mmap() call.  If NORESERVE
-was specified, then this routine returns immediately as no reservations
-are desired.
-
-The arguments 'from' and 'to' are huge page indices into the mapping or
-underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
-the length of the segment/mapping.  For mmap(), the offset argument could
-be used to specify the offset into the underlying file.  In such a case,
-the 'from' and 'to' arguments have been adjusted by this offset.
-
-One of the big differences between PRIVATE and SHARED mappings is the way
-in which reservations are represented in the reservation map.
-
-- For shared mappings, an entry in the reservation map indicates a reservation
-  exists or did exist for the corresponding page.  As reservations are
-  consumed, the reservation map is not modified.
-- For private mappings, the lack of an entry in the reservation map indicates
-  a reservation exists for the corresponding page.  As reservations are
-  consumed, entries are added to the reservation map.  Therefore, the
-  reservation map can also be used to determine which reservations have
-  been consumed.
-
-For private mappings, hugetlb_reserve_pages() creates the reservation map and
-hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag is set
-to indicate this VMA owns the reservations.
-
-The reservation map is consulted to determine how many huge page reservations
-are needed for the current mapping/segment.  For private mappings, this is
-always the value (to - from).  However, for shared mappings it is possible that
-some reservations may already exist within the range (to - from).  See the
-section :ref:`Reservation Map Modifications <resv_map_modifications>`
-for details on how this is accomplished.
-
-The mapping may be associated with a subpool.  If so, the subpool is consulted
-to ensure there is sufficient space for the mapping.  It is possible that the
-subpool has set aside reservations that can be used for the mapping.  See the
-section :ref:`Subpool Reservations <sub_pool_resv>` for more details.
-
-After consulting the reservation map and subpool, the number of needed new
-reservations is known.  The routine hugetlb_acct_memory() is called to check
-for and take the requested number of reservations.  hugetlb_acct_memory()
-calls into routines that potentially allocate and adjust surplus page counts.
-However, within those routines the code is simply checking to ensure there
-are enough free huge pages to accommodate the reservation.  If there are,
-the global reservation count resv_huge_pages is adjusted something like the
-following::
-
-	if (resv_needed <= (resv_huge_pages - free_huge_pages))
-		resv_huge_pages += resv_needed;
-
-Note that the global lock hugetlb_lock is held when checking and adjusting
-these counters.
-
-If there were enough free huge pages and the global count resv_huge_pages
-was adjusted, then the reservation map associated with the mapping is
-modified to reflect the reservations.  In the case of a shared mapping, a
-file_region will exist that includes the range 'from' - 'to'.  For private
-mappings, no modifications are made to the reservation map as lack of an
-entry indicates a reservation exists.
-
-If hugetlb_reserve_pages() was successful, the global reservation count and
-reservation map associated with the mapping will be modified as required to
-ensure reservations exist for the range 'from' - 'to'.
-
-.. _consume_resv:
-
-Consuming Reservations/Allocating a Huge Page
-=============================================
-
-Reservations are consumed when huge pages associated with the reservations
-are allocated and instantiated in the corresponding mapping.  The allocation
-is performed within the routine alloc_huge_page()::
-
-	struct page *alloc_huge_page(struct vm_area_struct *vma,
-				     unsigned long addr, int avoid_reserve)
-
-alloc_huge_page is passed a VMA pointer and a virtual address, so it can
-consult the reservation map to determine if a reservation exists.  In addition,
-alloc_huge_page takes the argument avoid_reserve which indicates reserves
-should not be used even if it appears they have been set aside for the
-specified address.  The avoid_reserve argument is most often used in the case
-of Copy on Write and Page Migration where additional copies of an existing
-page are being allocated.
-
-The helper routine vma_needs_reservation() is called to determine if a
-reservation exists for the address within the mapping(vma).  See the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>` for detailed
-information on what this routine does.
-The value returned from vma_needs_reservation() is generally
-0 or 1.  0 if a reservation exists for the address, 1 if no reservation exists.
-If a reservation does not exist, and there is a subpool associated with the
-mapping the subpool is consulted to determine if it contains reservations.
-If the subpool contains reservations, one can be used for this allocation.
-However, in every case the avoid_reserve argument overrides the use of
-a reservation for the allocation.  After determining whether a reservation
-exists and can be used for the allocation, the routine dequeue_huge_page_vma()
-is called.  This routine takes two arguments related to reservations:
-
-- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
-- chg, even though this argument is of type long only the values 0 or 1 are
-  passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
-  reservation exists (see the section "Memory Policy and Reservations" for
-  possible issues).  If the value is 1, it indicates a reservation does not
-  exist and the page must be taken from the global free pool if possible.
-
-The free lists associated with the memory policy of the VMA are searched for
-a free page.  If a page is found, the value free_huge_pages is decremented
-when the page is removed from the free list.  If there was a reservation
-associated with the page, the following adjustments are made::
-
-	SetPagePrivate(page);	/* Indicates allocating this page consumed
-				 * a reservation, and if an error is
-				 * encountered such that the page must be
-				 * freed, the reservation will be restored. */
-	resv_huge_pages--;	/* Decrement the global reservation count */
-
-Note, if no huge page can be found that satisfies the VMA's memory policy
-an attempt will be made to allocate one using the buddy allocator.  This
-brings up the issue of surplus huge pages and overcommit which is beyond
-the scope reservations.  Even if a surplus page is allocated, the same
-reservation based adjustments as above will be made: SetPagePrivate(page) and
-resv_huge_pages--.
-
-After obtaining a new huge page, (page)->private is set to the value of
-the subpool associated with the page if it exists.  This will be used for
-subpool accounting when the page is freed.
-
-The routine vma_commit_reservation() is then called to adjust the reserve
-map based on the consumption of the reservation.  In general, this involves
-ensuring the page is represented within a file_region structure of the region
-map.  For shared mappings where the reservation was present, an entry
-in the reserve map already existed so no change is made.  However, if there
-was no reservation in a shared mapping or this was a private mapping a new
-entry must be created.
-
-It is possible that the reserve map could have been changed between the call
-to vma_needs_reservation() at the beginning of alloc_huge_page() and the
-call to vma_commit_reservation() after the page was allocated.  This would
-be possible if hugetlb_reserve_pages was called for the same page in a shared
-mapping.  In such cases, the reservation count and subpool free page count
-will be off by one.  This rare condition can be identified by comparing the
-return value from vma_needs_reservation and vma_commit_reservation.  If such
-a race is detected, the subpool and global reserve counts are adjusted to
-compensate.  See the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>` for more
-information on these routines.
-
-
-Instantiate Huge Pages
-======================
-
-After huge page allocation, the page is typically added to the page tables
-of the allocating task.  Before this, pages in a shared mapping are added
-to the page cache and pages in private mappings are added to an anonymous
-reverse mapping.  In both cases, the PagePrivate flag is cleared.  Therefore,
-when a huge page that has been instantiated is freed no adjustment is made
-to the global reservation count (resv_huge_pages).
-
-
-Freeing Huge Pages
-==================
-
-Huge page freeing is performed by the routine free_huge_page().  This routine
-is the destructor for hugetlbfs compound pages.  As a result, it is only
-passed a pointer to the page struct.  When a huge page is freed, reservation
-accounting may need to be performed.  This would be the case if the page was
-associated with a subpool that contained reserves, or the page is being freed
-on an error path where a global reserve count must be restored.
-
-The page->private field points to any subpool associated with the page.
-If the PagePrivate flag is set, it indicates the global reserve count should
-be adjusted (see the section
-:ref:`Consuming Reservations/Allocating a Huge Page <consume_resv>`
-for information on how these are set).
-
-The routine first calls hugepage_subpool_put_pages() for the page.  If this
-routine returns a value of 0 (which does not equal the value passed 1) it
-indicates reserves are associated with the subpool, and this newly free page
-must be used to keep the number of subpool reserves above the minimum size.
-Therefore, the global resv_huge_pages counter is incremented in this case.
-
-If the PagePrivate flag was set in the page, the global resv_huge_pages counter
-will always be incremented.
-
-.. _sub_pool_resv:
-
-Subpool Reservations
-====================
-
-There is a struct hstate associated with each huge page size.  The hstate
-tracks all huge pages of the specified size.  A subpool represents a subset
-of pages within a hstate that is associated with a mounted hugetlbfs
-filesystem.
-
-When a hugetlbfs filesystem is mounted a min_size option can be specified
-which indicates the minimum number of huge pages required by the filesystem.
-If this option is specified, the number of huge pages corresponding to
-min_size are reserved for use by the filesystem.  This number is tracked in
-the min_hpages field of a struct hugepage_subpool.  At mount time,
-hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
-huge pages.  If they can not be reserved, the mount fails.
-
-The routines hugepage_subpool_get/put_pages() are called when pages are
-obtained from or released back to a subpool.  They perform all subpool
-accounting, and track any reservations associated with the subpool.
-hugepage_subpool_get/put_pages are passed the number of huge pages by which
-to adjust the subpool 'used page' count (down for get, up for put).  Normally,
-they return the same value that was passed or an error if not enough pages
-exist in the subpool.
-
-However, if reserves are associated with the subpool a return value less
-than the passed value may be returned.  This return value indicates the
-number of additional global pool adjustments which must be made.  For example,
-suppose a subpool contains 3 reserved huge pages and someone asks for 5.
-The 3 reserved pages associated with the subpool can be used to satisfy part
-of the request.  But, 2 pages must be obtained from the global pools.  To
-relay this information to the caller, the value 2 is returned.  The caller
-is then responsible for attempting to obtain the additional two pages from
-the global pools.
-
-
-COW and Reservations
-====================
-
-Since shared mappings all point to and use the same underlying pages, the
-biggest reservation concern for COW is private mappings.  In this case,
-two tasks can be pointing at the same previously allocated page.  One task
-attempts to write to the page, so a new page must be allocated so that each
-task points to its own page.
-
-When the page was originally allocated, the reservation for that page was
-consumed.  When an attempt to allocate a new page is made as a result of
-COW, it is possible that no free huge pages are free and the allocation
-will fail.
-
-When the private mapping was originally created, the owner of the mapping
-was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
-map of the owner.  Since the owner created the mapping, the owner owns all
-the reservations associated with the mapping.  Therefore, when a write fault
-occurs and there is no page available, different action is taken for the owner
-and non-owner of the reservation.
-
-In the case where the faulting task is not the owner, the fault will fail and
-the task will typically receive a SIGBUS.
-
-If the owner is the faulting task, we want it to succeed since it owned the
-original reservation.  To accomplish this, the page is unmapped from the
-non-owning task.  In this way, the only reference is from the owning task.
-In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
-of the non-owning task.  The non-owning task may receive a SIGBUS if it later
-faults on a non-present page.  But, the original owner of the
-mapping/reservation will behave as expected.
-
-
-.. _resv_map_modifications:
-
-Reservation Map Modifications
-=============================
-
-The following low level routines are used to make modifications to a
-reservation map.  Typically, these routines are not called directly.  Rather,
-a reservation map helper routine is called which calls one of these low level
-routines.  These low level routines are fairly well documented in the source
-code (mm/hugetlb.c).  These routines are::
-
-	long region_chg(struct resv_map *resv, long f, long t);
-	long region_add(struct resv_map *resv, long f, long t);
-	void region_abort(struct resv_map *resv, long f, long t);
-	long region_count(struct resv_map *resv, long f, long t);
-
-Operations on the reservation map typically involve two operations:
-
-1) region_chg() is called to examine the reserve map and determine how
-   many pages in the specified range [f, t) are NOT currently represented.
-
-   The calling code performs global checks and allocations to determine if
-   there are enough huge pages for the operation to succeed.
-
-2)
-  a) If the operation can succeed, region_add() is called to actually modify
-     the reservation map for the same range [f, t) previously passed to
-     region_chg().
-  b) If the operation can not succeed, region_abort is called for the same
-     range [f, t) to abort the operation.
-
-Note that this is a two step process where region_add() and region_abort()
-are guaranteed to succeed after a prior call to region_chg() for the same
-range.  region_chg() is responsible for pre-allocating any data structures
-necessary to ensure the subsequent operations (specifically region_add()))
-will succeed.
-
-As mentioned above, region_chg() determines the number of pages in the range
-which are NOT currently represented in the map.  This number is returned to
-the caller.  region_add() returns the number of pages in the range added to
-the map.  In most cases, the return value of region_add() is the same as the
-return value of region_chg().  However, in the case of shared mappings it is
-possible for changes to the reservation map to be made between the calls to
-region_chg() and region_add().  In this case, the return value of region_add()
-will not match the return value of region_chg().  It is likely that in such
-cases global counts and subpool accounting will be incorrect and in need of
-adjustment.  It is the responsibility of the caller to check for this condition
-and make the appropriate adjustments.
-
-The routine region_del() is called to remove regions from a reservation map.
-It is typically called in the following situations:
-
-- When a file in the hugetlbfs filesystem is being removed, the inode will
-  be released and the reservation map freed.  Before freeing the reservation
-  map, all the individual file_region structures must be freed.  In this case
-  region_del is passed the range [0, LONG_MAX).
-- When a hugetlbfs file is being truncated.  In this case, all allocated pages
-  after the new file size must be freed.  In addition, any file_region entries
-  in the reservation map past the new end of file must be deleted.  In this
-  case, region_del is passed the range [new_end_of_file, LONG_MAX).
-- When a hole is being punched in a hugetlbfs file.  In this case, huge pages
-  are removed from the middle of the file one at a time.  As the pages are
-  removed, region_del() is called to remove the corresponding entry from the
-  reservation map.  In this case, region_del is passed the range
-  [page_idx, page_idx + 1).
-
-In every case, region_del() will return the number of pages removed from the
-reservation map.  In VERY rare cases, region_del() can fail.  This can only
-happen in the hole punch case where it has to split an existing file_region
-entry and can not allocate a new structure.  In this error case, region_del()
-will return -ENOMEM.  The problem here is that the reservation map will
-indicate that there is a reservation for the page.  However, the subpool and
-global reservation counts will not reflect the reservation.  To handle this
-situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
-counters so that they correspond with the reservation map entry that could
-not be deleted.
-
-region_count() is called when unmapping a private huge page mapping.  In
-private mappings, the lack of a entry in the reservation map indicates that
-a reservation exists.  Therefore, by counting the number of entries in the
-reservation map we know how many reservations were consumed and how many are
-outstanding (outstanding = (end - start) - region_count(resv, start, end)).
-Since the mapping is going away, the subpool and global reservation counts
-are decremented by the number of outstanding reservations.
-
-.. _resv_map_helpers:
-
-Reservation Map Helper Routines
-===============================
-
-Several helper routines exist to query and modify the reservation maps.
-These routines are only interested with reservations for a specific huge
-page, so they just pass in an address instead of a range.  In addition,
-they pass in the associated VMA.  From the VMA, the type of mapping (private
-or shared) and the location of the reservation map (inode or VMA) can be
-determined.  These routines simply call the underlying routines described
-in the section "Reservation Map Modifications".  However, they do take into
-account the 'opposite' meaning of reservation map entries for private and
-shared mappings and hide this detail from the caller::
-
-	long vma_needs_reservation(struct hstate *h,
-				   struct vm_area_struct *vma,
-				   unsigned long addr)
-
-This routine calls region_chg() for the specified page.  If no reservation
-exists, 1 is returned.  If a reservation exists, 0 is returned::
-
-	long vma_commit_reservation(struct hstate *h,
-				    struct vm_area_struct *vma,
-				    unsigned long addr)
-
-This calls region_add() for the specified page.  As in the case of region_chg
-and region_add, this routine is to be called after a previous call to
-vma_needs_reservation.  It will add a reservation entry for the page.  It
-returns 1 if the reservation was added and 0 if not.  The return value should
-be compared with the return value of the previous call to
-vma_needs_reservation.  An unexpected difference indicates the reservation
-map was modified between calls::
-
-	void vma_end_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This calls region_abort() for the specified page.  As in the case of region_chg
-and region_abort, this routine is to be called after a previous call to
-vma_needs_reservation.  It will abort/end the in progress reservation add
-operation::
-
-	long vma_add_reservation(struct hstate *h,
-				 struct vm_area_struct *vma,
-				 unsigned long addr)
-
-This is a special wrapper routine to help facilitate reservation cleanup
-on error paths.  It is only called from the routine restore_reserve_on_error().
-This routine is used in conjunction with vma_needs_reservation in an attempt
-to add a reservation to the reservation map.  It takes into account the
-different reservation map semantics for private and shared mappings.  Hence,
-region_add is called for shared mappings (as an entry present in the map
-indicates a reservation), and region_del is called for private mappings (as
-the absence of an entry in the map indicates a reservation).  See the section
-"Reservation cleanup in error paths" for more information on what needs to
-be done on error paths.
-
-
-Reservation Cleanup in Error Paths
-==================================
-
-As mentioned in the section
-:ref:`Reservation Map Helper Routines <resv_map_helpers>`, reservation
-map modifications are performed in two steps.  First vma_needs_reservation
-is called before a page is allocated.  If the allocation is successful,
-then vma_commit_reservation is called.  If not, vma_end_reservation is called.
-Global and subpool reservation counts are adjusted based on success or failure
-of the operation and all is well.
-
-Additionally, after a huge page is instantiated the PagePrivate flag is
-cleared so that accounting when the page is ultimately freed is correct.
-
-However, there are several instances where errors are encountered after a huge
-page is allocated but before it is instantiated.  In this case, the page
-allocation has consumed the reservation and made the appropriate subpool,
-reservation map and global count adjustments.  If the page is freed at this
-time (before instantiation and clearing of PagePrivate), then free_huge_page
-will increment the global reservation count.  However, the reservation map
-indicates the reservation was consumed.  This resulting inconsistent state
-will cause the 'leak' of a reserved huge page.  The global reserve count will
-be  higher than it should and prevent allocation of a pre-allocated page.
-
-The routine restore_reserve_on_error() attempts to handle this situation.  It
-is fairly well documented.  The intention of this routine is to restore
-the reservation map to the way it was before the page allocation.   In this
-way, the state of the reservation map will correspond to the global reservation
-count after the page is freed.
-
-The routine restore_reserve_on_error itself may encounter errors while
-attempting to restore the reservation map entry.  In this case, it will
-simply clear the PagePrivate flag of the page.  In this way, the global
-reserve count will not be incremented when the page is freed.  However, the
-reservation map will continue to look as though the reservation was consumed.
-A page can still be allocated for the address, but it will not use a reserved
-page as originally intended.
-
-There is some code (most notably userfaultfd) which can not call
-restore_reserve_on_error.  In this case, it simply modifies the PagePrivate
-so that a reservation will not be leaked when the huge page is freed.
-
-
-Reservations and Memory Policy
-==============================
-Per-node huge page lists existed in struct hstate when git was first used
-to manage Linux code.  The concept of reservations was added some time later.
-When reservations were added, no attempt was made to take memory policy
-into account.  While cpusets are not exactly the same as memory policy, this
-comment in hugetlb_acct_memory sums up the interaction between reservations
-and cpusets/memory policy::
-
-	/*
-	 * When cpuset is configured, it breaks the strict hugetlb page
-	 * reservation as the accounting is done on a global variable. Such
-	 * reservation is completely rubbish in the presence of cpuset because
-	 * the reservation is not checked against page availability for the
-	 * current cpuset. Application can still potentially OOM'ed by kernel
-	 * with lack of free htlb page in cpuset that the task is in.
-	 * Attempt to enforce strict accounting with cpuset is almost
-	 * impossible (or too ugly) because cpuset is too fluid that
-	 * task or memory node can be dynamically moved between cpusets.
-	 *
-	 * The change of semantics for shared hugetlb mapping with cpuset is
-	 * undesirable. However, in order to preserve some of the semantics,
-	 * we fall back to check against current free page availability as
-	 * a best attempt and hopefully to minimize the impact of changing
-	 * semantics that cpuset has.
-	 */
-
-Huge page reservations were added to prevent unexpected page allocation
-failures (OOM) at page fault time.  However, if an application makes use
-of cpusets or memory policy there is no guarantee that huge pages will be
-available on the required nodes.  This is true even if there are a sufficient
-number of global reservations.
-
-Hugetlbfs regression testing
-============================
-
-The most complete set of hugetlb tests are in the libhugetlbfs repository.
-If you modify any hugetlb related code, use the libhugetlbfs test suite
-to check for regressions.  In addition, if you add any new hugetlb
-functionality, please add appropriate tests to libhugetlbfs.
-
---
-Mike Kravetz, 7 April 2017
diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst
deleted file mode 100644
index b9d5253c1305..000000000000
--- a/Documentation/vm/hwpoison.rst
+++ /dev/null
@@ -1,184 +0,0 @@
-.. hwpoison:
-
-========
-hwpoison
-========
-
-What is hwpoison?
-=================
-
-Upcoming Intel CPUs have support for recovering from some memory errors
-(``MCA recovery``). This requires the OS to declare a page "poisoned",
-kill the processes associated with it and avoid using it in the future.
-
-This patchkit implements the necessary infrastructure in the VM.
-
-To quote the overview comment::
-
-	High level machine check handler. Handles pages reported by the
-	hardware as being corrupted usually due to a 2bit ECC memory or cache
-	failure.
-
-	This focusses on pages detected as corrupted in the background.
-	When the current CPU tries to consume corruption the currently
-	running process can just be killed directly instead. This implies
-	that if the error cannot be handled for some reason it's safe to
-	just ignore it because no corruption has been consumed yet. Instead
-	when that happens another machine check will happen.
-
-	Handles page cache pages in various states. The tricky part
-	here is that we can access any page asynchronous to other VM
-	users, because memory failures could happen anytime and anywhere,
-	possibly violating some of their assumptions. This is why this code
-	has to be extremely careful. Generally it tries to use normal locking
-	rules, as in get the standard locks, even if that means the
-	error handling takes potentially a long time.
-
-	Some of the operations here are somewhat inefficient and have non
-	linear algorithmic complexity, because the data structures have not
-	been optimized for this case. This is in particular the case
-	for the mapping from a vma to a process. Since this case is expected
-	to be rare we hope we can get away with this.
-
-The code consists of a the high level handler in mm/memory-failure.c,
-a new page poison bit and various checks in the VM to handle poisoned
-pages.
-
-The main target right now is KVM guests, but it works for all kinds
-of applications. KVM support requires a recent qemu-kvm release.
-
-For the KVM use there was need for a new signal type so that
-KVM can inject the machine check into the guest with the proper
-address. This in theory allows other applications to handle
-memory failures too. The expection is that near all applications
-won't do that, but some very specialized ones might.
-
-Failure recovery modes
-======================
-
-There are two (actually three) modes memory failure recovery can be in:
-
-vm.memory_failure_recovery sysctl set to zero:
-	All memory failures cause a panic. Do not attempt recovery.
-
-early kill
-	(can be controlled globally and per process)
-	Send SIGBUS to the application as soon as the error is detected
-	This allows applications who can process memory errors in a gentle
-	way (e.g. drop affected object)
-	This is the mode used by KVM qemu.
-
-late kill
-	Send SIGBUS when the application runs into the corrupted page.
-	This is best for memory error unaware applications and default
-	Note some pages are always handled as late kill.
-
-User control
-============
-
-vm.memory_failure_recovery
-	See sysctl.txt
-
-vm.memory_failure_early_kill
-	Enable early kill mode globally
-
-PR_MCE_KILL
-	Set early/late kill mode/revert to system default
-
-	arg1: PR_MCE_KILL_CLEAR:
-		Revert to system default
-	arg1: PR_MCE_KILL_SET:
-		arg2 defines thread specific mode
-
-		PR_MCE_KILL_EARLY:
-			Early kill
-		PR_MCE_KILL_LATE:
-			Late kill
-		PR_MCE_KILL_DEFAULT
-			Use system global default
-
-	Note that if you want to have a dedicated thread which handles
-	the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should
-	call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise,
-	the SIGBUS is sent to the main thread.
-
-PR_MCE_KILL_GET
-	return current mode
-
-Testing
-=======
-
-* madvise(MADV_HWPOISON, ....) (as root) - Poison a page in the
-  process for testing
-
-* hwpoison-inject module through debugfs ``/sys/kernel/debug/hwpoison/``
-
-  corrupt-pfn
-	Inject hwpoison fault at PFN echoed into this file. This does
-	some early filtering to avoid corrupted unintended pages in test suites.
-
-  unpoison-pfn
-	Software-unpoison page at PFN echoed into this file. This way
-	a page can be reused again.  This only works for Linux
-	injected failures, not for real memory failures. Once any hardware
-	memory failure happens, this feature is disabled.
-
-  Note these injection interfaces are not stable and might change between
-  kernel versions
-
-  corrupt-filter-dev-major, corrupt-filter-dev-minor
-	Only handle memory failures to pages associated with the file
-	system defined by block device major/minor.  -1U is the
-	wildcard value.  This should be only used for testing with
-	artificial injection.
-
-  corrupt-filter-memcg
-	Limit injection to pages owned by memgroup. Specified by inode
-	number of the memcg.
-
-	Example::
-
-		mkdir /sys/fs/cgroup/mem/hwpoison
-
-	        usemem -m 100 -s 1000 &
-		echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
-
-		memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
-		echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
-
-		page-types -p `pidof init`   --hwpoison  # shall do nothing
-		page-types -p `pidof usemem` --hwpoison  # poison its pages
-
-  corrupt-filter-flags-mask, corrupt-filter-flags-value
-	When specified, only poison pages if ((page_flags & mask) ==
-	value).  This allows stress testing of many kinds of
-	pages. The page_flags are the same as in /proc/kpageflags. The
-	flag bits are defined in include/linux/kernel-page-flags.h and
-	documented in Documentation/admin-guide/mm/pagemap.rst
-
-* Architecture specific MCE injector
-
-  x86 has mce-inject, mce-test
-
-  Some portable hwpoison test programs in mce-test, see below.
-
-References
-==========
-
-http://halobates.de/mce-lc09-2.pdf
-	Overview presentation from LinuxCon 09
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
-	Test suite (hwpoison specific portable tests in tsrc)
-
-git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
-	x86 specific injector
-
-
-Limitations
-===========
-- Not all page types are supported and never will. Most kernel internal
-  objects cannot be recovered, only LRU pages for now.
-
----
-Andi Kleen, Oct 2009
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
deleted file mode 100644
index 575ccd40e30c..000000000000
--- a/Documentation/vm/index.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-=====================================
-Linux Memory Management Documentation
-=====================================
-
-Memory Management Guide
-=======================
-
-This is a guide to understanding the memory management subsystem
-of Linux.  If you are looking for advice on simply allocating memory,
-see the :ref:`memory_allocation`.  For controlling and tuning guides,
-see the :doc:`admin guide <../admin-guide/mm/index>`.
-
-.. toctree::
-   :maxdepth: 1
-
-   physical_memory
-   page_tables
-   process_addrs
-   bootmem
-   page_allocation
-   vmalloc
-   slab
-   highmem
-   page_reclaim
-   swap
-   page_cache
-   shmfs
-   oom
-
-Legacy Documentation
-====================
-
-This is a collection of older documents about the Linux memory management
-(MM) subsystem internals with different level of details ranging from
-notes and mailing list responses for elaborating descriptions of data
-structures and algorithms.  It should all be integrated nicely into the
-above structured documentation, or deleted if it has served its purpose.
-
-.. toctree::
-   :maxdepth: 1
-
-   active_mm
-   arch_pgtable_helpers
-   balance
-   damon/index
-   free_page_reporting
-   frontswap
-   hmm
-   hwpoison
-   hugetlbfs_reserv
-   ksm
-   memory-model
-   mmu_notifier
-   numa
-   overcommit-accounting
-   page_migration
-   page_frags
-   page_owner
-   page_table_check
-   remap_file_pages
-   slub
-   split_page_table_lock
-   transhuge
-   unevictable-lru
-   vmalloced-kernel-stacks
-   vmemmap_dedup
-   z3fold
-   zsmalloc
diff --git a/Documentation/vm/ksm.rst b/Documentation/vm/ksm.rst
deleted file mode 100644
index 9e37add068e6..000000000000
--- a/Documentation/vm/ksm.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. _ksm:
-
-=======================
-Kernel Samepage Merging
-=======================
-
-KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
-added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
-and http://lwn.net/Articles/306704/ and https://lwn.net/Articles/330589/
-
-The userspace interface of KSM is described in :ref:`Documentation/admin-guide/mm/ksm.rst <admin_guide_ksm>`
-
-Design
-======
-
-Overview
---------
-
-.. kernel-doc:: mm/ksm.c
-   :DOC: Overview
-
-Reverse mapping
----------------
-KSM maintains reverse mapping information for KSM pages in the stable
-tree.
-
-If a KSM page is shared between less than ``max_page_sharing`` VMAs,
-the node of the stable tree that represents such KSM page points to a
-list of struct rmap_item and the ``page->mapping`` of the
-KSM page points to the stable tree node.
-
-When the sharing passes this threshold, KSM adds a second dimension to
-the stable tree. The tree node becomes a "chain" that links one or
-more "dups". Each "dup" keeps reverse mapping information for a KSM
-page with ``page->mapping`` pointing to that "dup".
-
-Every "chain" and all "dups" linked into a "chain" enforce the
-invariant that they represent the same write protected memory content,
-even if each "dup" will be pointed by a different KSM page copy of
-that content.
-
-This way the stable tree lookup computational complexity is unaffected
-if compared to an unlimited list of reverse mappings. It is still
-enforced that there cannot be KSM page content duplicates in the
-stable tree itself.
-
-The deduplication limit enforced by ``max_page_sharing`` is required
-to avoid the virtual memory rmap lists to grow too large. The rmap
-walk has O(N) complexity where N is the number of rmap_items
-(i.e. virtual mappings) that are sharing the page, which is in turn
-capped by ``max_page_sharing``. So this effectively spreads the linear
-O(N) computational complexity from rmap walk context over different
-KSM pages. The ksmd walk over the stable_node "chains" is also O(N),
-but N is the number of stable_node "dups", not the number of
-rmap_items, so it has not a significant impact on ksmd performance. In
-practice the best stable_node "dup" candidate will be kept and found
-at the head of the "dups" list.
-
-High values of ``max_page_sharing`` result in faster memory merging
-(because there will be fewer stable_node dups queued into the
-stable_node chain->hlist to check for pruning) and higher
-deduplication factor at the expense of slower worst case for rmap
-walks for any KSM page which can happen during swapping, compaction,
-NUMA balancing and page migration.
-
-The ``stable_node_dups/stable_node_chains`` ratio is also affected by the
-``max_page_sharing`` tunable, and an high ratio may indicate fragmentation
-in the stable_node dups, which could be solved by introducing
-fragmentation algorithms in ksmd which would refile rmap_items from
-one stable_node dup to another stable_node dup, in order to free up
-stable_node "dups" with few rmap_items in them, but that may increase
-the ksmd CPU usage and possibly slowdown the readonly computations on
-the KSM pages of the applications.
-
-The whole list of stable_node "dups" linked in the stable_node
-"chains" is scanned periodically in order to prune stale stable_nodes.
-The frequency of such scans is defined by
-``stable_node_chains_prune_millisecs`` sysfs tunable.
-
-Reference
----------
-.. kernel-doc:: mm/ksm.c
-   :functions: mm_slot ksm_scan stable_node rmap_item
-
---
-Izik Eidus,
-Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
deleted file mode 100644
index 30e8fbed6914..000000000000
--- a/Documentation/vm/memory-model.rst
+++ /dev/null
@@ -1,177 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _physical_memory_model:
-
-=====================
-Physical Memory Model
-=====================
-
-Physical memory in a system may be addressed in different ways. The
-simplest case is when the physical memory starts at address 0 and
-spans a contiguous range up to the maximal address. It could be,
-however, that this range contains small holes that are not accessible
-for the CPU. Then there could be several contiguous ranges at
-completely distinct addresses. And, don't forget about NUMA, where
-different memory banks are attached to different CPUs.
-
-Linux abstracts this diversity using one of the two memory models:
-FLATMEM and SPARSEMEM. Each architecture defines what
-memory models it supports, what the default memory model is and
-whether it is possible to manually override that default.
-
-All the memory models track the status of physical page frames using
-struct page arranged in one or more arrays.
-
-Regardless of the selected memory model, there exists one-to-one
-mapping between the physical page frame number (PFN) and the
-corresponding `struct page`.
-
-Each memory model defines :c:func:`pfn_to_page` and :c:func:`page_to_pfn`
-helpers that allow the conversion from PFN to `struct page` and vice
-versa.
-
-FLATMEM
-=======
-
-The simplest memory model is FLATMEM. This model is suitable for
-non-NUMA systems with contiguous, or mostly contiguous, physical
-memory.
-
-In the FLATMEM memory model, there is a global `mem_map` array that
-maps the entire physical memory. For most architectures, the holes
-have entries in the `mem_map` array. The `struct page` objects
-corresponding to the holes are never fully initialized.
-
-To allocate the `mem_map` array, architecture specific setup code should
-call :c:func:`free_area_init` function. Yet, the mappings array is not
-usable until the call to :c:func:`memblock_free_all` that hands all the
-memory to the page allocator.
-
-An architecture may free parts of the `mem_map` array that do not cover the
-actual physical pages. In such case, the architecture specific
-:c:func:`pfn_valid` implementation should take the holes in the
-`mem_map` into account.
-
-With FLATMEM, the conversion between a PFN and the `struct page` is
-straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the
-`mem_map` array.
-
-The `ARCH_PFN_OFFSET` defines the first page frame number for
-systems with physical memory starting at address different from 0.
-
-SPARSEMEM
-=========
-
-SPARSEMEM is the most versatile memory model available in Linux and it
-is the only memory model that supports several advanced features such
-as hot-plug and hot-remove of the physical memory, alternative memory
-maps for non-volatile memory devices and deferred initialization of
-the memory map for larger systems.
-
-The SPARSEMEM model presents the physical memory as a collection of
-sections. A section is represented with struct mem_section
-that contains `section_mem_map` that is, logically, a pointer to an
-array of struct pages. However, it is stored with some other magic
-that aids the sections management. The section size and maximal number
-of section is specified using `SECTION_SIZE_BITS` and
-`MAX_PHYSMEM_BITS` constants defined by each architecture that
-supports SPARSEMEM. While `MAX_PHYSMEM_BITS` is an actual width of a
-physical address that an architecture supports, the
-`SECTION_SIZE_BITS` is an arbitrary value.
-
-The maximal number of sections is denoted `NR_MEM_SECTIONS` and
-defined as
-
-.. math::
-
-   NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
-
-The `mem_section` objects are arranged in a two-dimensional array
-called `mem_sections`. The size and placement of this array depend
-on `CONFIG_SPARSEMEM_EXTREME` and the maximal possible number of
-sections:
-
-* When `CONFIG_SPARSEMEM_EXTREME` is disabled, the `mem_sections`
-  array is static and has `NR_MEM_SECTIONS` rows. Each row holds a
-  single `mem_section` object.
-* When `CONFIG_SPARSEMEM_EXTREME` is enabled, the `mem_sections`
-  array is dynamically allocated. Each row contains PAGE_SIZE worth of
-  `mem_section` objects and the number of rows is calculated to fit
-  all the memory sections.
-
-The architecture setup code should call sparse_init() to
-initialize the memory sections and the memory maps.
-
-With SPARSEMEM there are two possible ways to convert a PFN to the
-corresponding `struct page` - a "classic sparse" and "sparse
-vmemmap". The selection is made at build time and it is determined by
-the value of `CONFIG_SPARSEMEM_VMEMMAP`.
-
-The classic sparse encodes the section number of a page in page->flags
-and uses high bits of a PFN to access the section that maps that page
-frame. Inside a section, the PFN is the index to the array of pages.
-
-The sparse vmemmap uses a virtually mapped memory map to optimize
-pfn_to_page and page_to_pfn operations. There is a global `struct
-page *vmemmap` pointer that points to a virtually contiguous array of
-`struct page` objects. A PFN is an index to that array and the
-offset of the `struct page` from `vmemmap` is the PFN of that
-page.
-
-To use vmemmap, an architecture has to reserve a range of virtual
-addresses that will map the physical pages containing the memory
-map and make sure that `vmemmap` points to that range. In addition,
-the architecture should implement :c:func:`vmemmap_populate` method
-that will allocate the physical memory and create page tables for the
-virtual memory map. If an architecture does not have any special
-requirements for the vmemmap mappings, it can use default
-:c:func:`vmemmap_populate_basepages` provided by the generic memory
-management.
-
-The virtually mapped memory map allows storing `struct page` objects
-for persistent memory devices in pre-allocated storage on those
-devices. This storage is represented with struct vmem_altmap
-that is eventually passed to vmemmap_populate() through a long chain
-of function calls. The vmemmap_populate() implementation may use the
-`vmem_altmap` along with :c:func:`vmemmap_alloc_block_buf` helper to
-allocate memory map on the persistent memory device.
-
-ZONE_DEVICE
-===========
-The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer
-`struct page` `mem_map` services for device driver identified physical
-address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact
-that the page objects for these address ranges are never marked online,
-and that a reference must be taken against the device, not just the page
-to keep the memory pinned for active use. `ZONE_DEVICE`, via
-:c:func:`devm_memremap_pages`, performs just enough memory hotplug to
-turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and
-:c:func:`get_user_pages` service for the given range of pfns. Since the
-page reference count never drops below 1 the page is never tracked as
-free memory and the page's `struct list_head lru` space is repurposed
-for back referencing to the host device / driver that mapped the memory.
-
-While `SPARSEMEM` presents memory as a collection of sections,
-optionally collected into memory blocks, `ZONE_DEVICE` users have a need
-for smaller granularity of populating the `mem_map`. Given that
-`ZONE_DEVICE` memory is never marked online it is subsequently never
-subject to its memory ranges being exposed through the sysfs memory
-hotplug api on memory block boundaries. The implementation relies on
-this lack of user-api constraint to allow sub-section sized memory
-ranges to be specified to :c:func:`arch_add_memory`, the top-half of
-memory hotplug. Sub-section support allows for 2MB as the cross-arch
-common alignment granularity for :c:func:`devm_memremap_pages`.
-
-The users of `ZONE_DEVICE` are:
-
-* pmem: Map platform persistent memory to be used as a direct-I/O target
-  via DAX mappings.
-
-* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
-  event callbacks to allow a device-driver to coordinate memory management
-  events related to device-memory, typically GPU memory. See
-  Documentation/vm/hmm.rst.
-
-* p2pdma: Create `struct page` objects to allow peer devices in a
-  PCI/-E topology to coordinate direct-DMA operations between themselves,
-  i.e. bypass host memory.
diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst
deleted file mode 100644
index df5d7777fc6b..000000000000
--- a/Documentation/vm/mmu_notifier.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-.. _mmu_notifier:
-
-When do you need to notify inside page table lock ?
-===================================================
-
-When clearing a pte/pmd we are given a choice to notify the event through
-(notify version of \*_clear_flush call mmu_notifier_invalidate_range) under
-the page table lock. But that notification is not necessary in all cases.
-
-For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use
-thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a
-process virtual address space). There is only 2 cases when you need to notify
-those secondary TLB while holding page table lock when clearing a pte/pmd:
-
-  A) page backing address is free before mmu_notifier_invalidate_range_end()
-  B) a page table entry is updated to point to a new page (COW, write fault
-     on zero page, __replace_page(), ...)
-
-Case A is obvious you do not want to take the risk for the device to write to
-a page that might now be used by some completely different task.
-
-Case B is more subtle. For correctness it requires the following sequence to
-happen:
-
-  - take page table lock
-  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
-  - set page table entry to point to new page
-
-If clearing the page table entry is not followed by a notify before setting
-the new pte/pmd value then you can break memory model like C11 or C++11 for
-the device.
-
-Consider the following scenario (device use a feature similar to ATS/PASID):
-
-Two address addrA and addrB such that \|addrA - addrB\| >= PAGE_SIZE we assume
-they are write protected for COW (other case of B apply too).
-
-::
-
- [Time N] --------------------------------------------------------------------
- CPU-thread-0  {try to write to addrA}
- CPU-thread-1  {try to write to addrB}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {read addrA and populate device TLB}
- DEV-thread-2  {read addrB and populate device TLB}
- [Time N+1] ------------------------------------------------------------------
- CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
- CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+2] ------------------------------------------------------------------
- CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
- CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {write to addrA which is a write to new page}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+3] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {preempted}
- CPU-thread-2  {}
- CPU-thread-3  {write to addrB which is a write to new page}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+4] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {}
- DEV-thread-2  {}
- [Time N+5] ------------------------------------------------------------------
- CPU-thread-0  {preempted}
- CPU-thread-1  {}
- CPU-thread-2  {}
- CPU-thread-3  {}
- DEV-thread-0  {read addrA from old page}
- DEV-thread-2  {read addrB from new page}
-
-So here because at time N+2 the clear page table entry was not pair with a
-notification to invalidate the secondary TLB, the device see the new value for
-addrB before seeing the new value for addrA. This break total memory ordering
-for the device.
-
-When changing a pte to write protect or to point to a new write protected page
-with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
-call to mmu_notifier_invalidate_range_end() outside the page table lock. This
-is true even if the thread doing the page table update is preempted right after
-releasing page table lock but before call mmu_notifier_invalidate_range_end().
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
deleted file mode 100644
index 99fdeca917ca..000000000000
--- a/Documentation/vm/numa.rst
+++ /dev/null
@@ -1,150 +0,0 @@
-.. _numa:
-
-Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
-
-=============
-What is NUMA?
-=============
-
-This question can be answered from a couple of perspectives:  the
-hardware view and the Linux software view.
-
-From the hardware perspective, a NUMA system is a computer platform that
-comprises multiple components or assemblies each of which may contain 0
-or more CPUs, local memory, and/or IO buses.  For brevity and to
-disambiguate the hardware view of these physical components/assemblies
-from the software abstraction thereof, we'll call the components/assemblies
-'cells' in this document.
-
-Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
-of the system--although some components necessary for a stand-alone SMP system
-may not be populated on any given cell.   The cells of the NUMA system are
-connected together with some sort of system interconnect--e.g., a crossbar or
-point-to-point link are common types of NUMA system interconnects.  Both of
-these types of interconnects can be aggregated to create NUMA platforms with
-cells at multiple distances from other cells.
-
-For Linux, the NUMA platforms of interest are primarily what is known as Cache
-Coherent NUMA or ccNUMA systems.   With ccNUMA systems, all memory is visible
-to and accessible from any CPU attached to any cell and cache coherency
-is handled in hardware by the processor caches and/or the system interconnect.
-
-Memory access time and effective memory bandwidth varies depending on how far
-away the cell containing the CPU or IO bus making the memory access is from the
-cell containing the target memory.  For example, access to memory by CPUs
-attached to the same cell will experience faster access times and higher
-bandwidths than accesses to memory on other, remote cells.  NUMA platforms
-can have cells at multiple remote distances from any given cell.
-
-Platform vendors don't build NUMA systems just to make software developers'
-lives interesting.  Rather, this architecture is a means to provide scalable
-memory bandwidth.  However, to achieve scalable memory bandwidth, system and
-application software must arrange for a large majority of the memory references
-[cache misses] to be to "local" memory--memory on the same cell, if any--or
-to the closest cell with memory.
-
-This leads to the Linux software view of a NUMA system:
-
-Linux divides the system's hardware resources into multiple software
-abstractions called "nodes".  Linux maps the nodes onto the physical cells
-of the hardware platform, abstracting away some of the details for some
-architectures.  As with physical cells, software nodes may contain 0 or more
-CPUs, memory and/or IO buses.  And, again, memory accesses to memory on
-"closer" nodes--nodes that map to closer cells--will generally experience
-faster access times and higher effective bandwidth than accesses to more
-remote cells.
-
-For some architectures, such as x86, Linux will "hide" any node representing a
-physical cell that has no memory attached, and reassign any CPUs attached to
-that cell to a node representing a cell that does have memory.  Thus, on
-these architectures, one cannot assume that all CPUs that Linux associates with
-a given node will see the same local memory access times and bandwidth.
-
-In addition, for some architectures, again x86 is an example, Linux supports
-the emulation of additional nodes.  For NUMA emulation, linux will carve up
-the existing nodes--or the system memory for non-NUMA platforms--into multiple
-nodes.  Each emulated node will manage a fraction of the underlying cells'
-physical memory.  NUMA emluation is useful for testing NUMA kernel and
-application features on non-NUMA platforms, and as a sort of memory resource
-management mechanism when used together with cpusets.
-[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-For each node with memory, Linux constructs an independent memory management
-subsystem, complete with its own free page lists, in-use page lists, usage
-statistics and locks to mediate access.  In addition, Linux constructs for
-each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
-an ordered "zonelist".  A zonelist specifies the zones/nodes to visit when a
-selected zone/node cannot satisfy the allocation request.  This situation,
-when a zone has no available memory to satisfy a request, is called
-"overflow" or "fallback".
-
-Because some nodes contain multiple zones containing different types of
-memory, Linux must decide whether to order the zonelists such that allocations
-fall back to the same zone type on a different node, or to a different zone
-type on the same node.  This is an important consideration because some zones,
-such as DMA or DMA32, represent relatively scarce resources.  Linux chooses
-a default Node ordered zonelist. This means it tries to fallback to other zones
-from the same node before using remote nodes which are ordered by NUMA distance.
-
-By default, Linux will attempt to satisfy memory allocation requests from the
-node to which the CPU that executes the request is assigned.  Specifically,
-Linux will attempt to allocate from the first node in the appropriate zonelist
-for the node where the request originates.  This is called "local allocation."
-If the "local" node cannot satisfy the request, the kernel will examine other
-nodes' zones in the selected zonelist looking for the first zone in the list
-that can satisfy the request.
-
-Local allocation will tend to keep subsequent access to the allocated memory
-"local" to the underlying physical resources and off the system interconnect--
-as long as the task on whose behalf the kernel allocated some memory does not
-later migrate away from that memory.  The Linux scheduler is aware of the
-NUMA topology of the platform--embodied in the "scheduling domains" data
-structures [see Documentation/scheduler/sched-domains.rst]--and the scheduler
-attempts to minimize task migration to distant scheduling domains.  However,
-the scheduler does not take a task's NUMA footprint into account directly.
-Thus, under sufficient imbalance, tasks can migrate between nodes, remote
-from their initial node and kernel data structures.
-
-System administrators and application designers can restrict a task's migration
-to improve NUMA locality using various CPU affinity command line interfaces,
-such as taskset(1) and numactl(1), and program interfaces such as
-sched_setaffinity(2).  Further, one can modify the kernel's default local
-allocation behavior using Linux NUMA memory policy. [see
-:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
-
-System administrators can restrict the CPUs and nodes' memories that a non-
-privileged user can specify in the scheduling or NUMA commands and functions
-using control groups and CPUsets.  [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
-
-On architectures that do not hide memoryless nodes, Linux will include only
-zones [nodes] with memory in the zonelists.  This means that for a memoryless
-node the "local memory node"--the node of the first zone in CPU's node's
-zonelist--will not be the node itself.  Rather, it will be the node that the
-kernel selected as the nearest node with memory when it built the zonelists.
-So, default, local allocations will succeed with the kernel supplying the
-closest available memory.  This is a consequence of the same mechanism that
-allows such allocations to fallback to other nearby nodes when a node that
-does contain memory overflows.
-
-Some kernel allocations do not want or cannot tolerate this allocation fallback
-behavior.  Rather they want to be sure they get memory from the specified node
-or get notified that the node has no free memory.  This is usually the case when
-a subsystem allocates per CPU memory resources, for example.
-
-A typical model for making such an allocation is to obtain the node id of the
-node to which the "current CPU" is attached using one of the kernel's
-numa_node_id() or CPU_to_node() functions and then request memory from only
-the node id returned.  When such an allocation fails, the requesting subsystem
-may revert to its own fallback path.  The slab kernel memory allocator is an
-example of this.  Or, the subsystem may choose to disable or not to enable
-itself on allocation failure.  The kernel profiling subsystem is an example of
-this.
-
-If the architecture supports--does not hide--memoryless nodes, then CPUs
-attached to memoryless nodes would always incur the fallback path overhead
-or some subsystems would fail to initialize if they attempted to allocated
-memory exclusively from a node without memory.  To support such
-architectures transparently, kernel subsystems can use the numa_mem_id()
-or cpu_to_mem() function to locate the "local memory node" for the calling or
-specified CPU.  Again, this is the same node from which default, local page
-allocations will be attempted.
diff --git a/Documentation/vm/oom.rst b/Documentation/vm/oom.rst
deleted file mode 100644
index 18e9e40c1ec1..000000000000
--- a/Documentation/vm/oom.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======================
-Out Of Memory Handling
-======================
diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/vm/overcommit-accounting.rst
deleted file mode 100644
index 1addb0c374a4..000000000000
--- a/Documentation/vm/overcommit-accounting.rst
+++ /dev/null
@@ -1,88 +0,0 @@
-.. _overcommit_accounting:
-
-=====================
-Overcommit Accounting
-=====================
-
-The Linux kernel supports the following overcommit handling modes
-
-0
-	Heuristic overcommit handling. Obvious overcommits of address
-	space are refused. Used for a typical system. It ensures a
-	seriously wild allocation fails while allowing overcommit to
-	reduce swap usage.  root is allowed to allocate slightly more
-	memory in this mode. This is the default.
-
-1
-	Always overcommit. Appropriate for some scientific
-	applications. Classic example is code using sparse arrays and
-	just relying on the virtual memory consisting almost entirely
-	of zero pages.
-
-2
-	Don't overcommit. The total address space commit for the
-	system is not permitted to exceed swap + a configurable amount
-	(default is 50%) of physical RAM.  Depending on the amount you
-	use, in most situations this means a process will not be
-	killed while accessing pages but will receive errors on memory
-	allocation as appropriate.
-
-	Useful for applications that want to guarantee their memory
-	allocations will be available in the future without having to
-	initialize every page.
-
-The overcommit policy is set via the sysctl ``vm.overcommit_memory``.
-
-The overcommit amount can be set via ``vm.overcommit_ratio`` (percentage)
-or ``vm.overcommit_kbytes`` (absolute value). These only have an effect
-when ``vm.overcommit_memory`` is set to 2.
-
-The current overcommit limit and amount committed are viewable in
-``/proc/meminfo`` as CommitLimit and Committed_AS respectively.
-
-Gotchas
-=======
-
-The C language stack growth does an implicit mremap. If you want absolute
-guarantees and run close to the edge you MUST mmap your stack for the
-largest size you think you will need. For typical stack usage this does
-not matter much but it's a corner case if you really really care
-
-In mode 2 the MAP_NORESERVE flag is ignored.
-
-
-How It Works
-============
-
-The overcommit is based on the following rules
-
-For a file backed map
-	| SHARED or READ-only	-	0 cost (the file is the map not swap)
-	| PRIVATE WRITABLE	-	size of mapping per instance
-
-For an anonymous or ``/dev/zero`` map
-	| SHARED			-	size of mapping
-	| PRIVATE READ-only	-	0 cost (but of little use)
-	| PRIVATE WRITABLE	-	size of mapping per instance
-
-Additional accounting
-	| Pages made writable copies by mmap
-	| shmfs memory drawn from the same pool
-
-Status
-======
-
-*	We account mmap memory mappings
-*	We account mprotect changes in commit
-*	We account mremap changes in size
-*	We account brk
-*	We account munmap
-*	We report the commit status in /proc
-*	Account and check on fork
-*	Review stack handling/building on exec
-*	SHMfs accounting
-*	Implement actual limit enforcement
-
-To Do
-=====
-*	Account ptrace pages (this is hard)
diff --git a/Documentation/vm/page_allocation.rst b/Documentation/vm/page_allocation.rst
deleted file mode 100644
index d9b4495561f1..000000000000
--- a/Documentation/vm/page_allocation.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Page Allocation
-===============
diff --git a/Documentation/vm/page_cache.rst b/Documentation/vm/page_cache.rst
deleted file mode 100644
index 75eba7c431b2..000000000000
--- a/Documentation/vm/page_cache.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========
-Page Cache
-==========
diff --git a/Documentation/vm/page_frags.rst b/Documentation/vm/page_frags.rst
deleted file mode 100644
index 7d6f9385d129..000000000000
--- a/Documentation/vm/page_frags.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-.. _page_frags:
-
-==============
-Page fragments
-==============
-
-A page fragment is an arbitrary-length arbitrary-offset area of memory
-which resides within a 0 or higher order compound page.  Multiple
-fragments within that page are individually refcounted, in the page's
-reference counter.
-
-The page_frag functions, page_frag_alloc and page_frag_free, provide a
-simple allocation framework for page fragments.  This is used by the
-network stack and network device drivers to provide a backing region of
-memory for use as either an sk_buff->head, or to be used in the "frags"
-portion of skb_shared_info.
-
-In order to make use of the page fragment APIs a backing page fragment
-cache is needed.  This provides a central point for the fragment allocation
-and tracks allows multiple calls to make use of a cached page.  The
-advantage to doing this is that multiple calls to get_page can be avoided
-which can be expensive at allocation time.  However due to the nature of
-this caching it is required that any calls to the cache be protected by
-either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
-to be disabled when executing the fragment allocation.
-
-The network stack uses two separate caches per CPU to handle fragment
-allocation.  The netdev_alloc_cache is used by callers making use of the
-netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
-used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
-main difference between these two calls is the context in which they may be
-called.  The "netdev" prefixed functions are usable in any context as these
-functions will disable interrupts, while the "napi" prefixed functions are
-only usable within the softirq context.
-
-Many network device drivers use a similar methodology for allocating page
-fragments, but the page fragments are cached at the ring or descriptor
-level.  In order to enable these cases it is necessary to provide a generic
-way of tearing down a page cache.  For this reason __page_frag_cache_drain
-was implemented.  It allows for freeing multiple references from a single
-page via a single call.  The advantage to doing this is that it allows for
-cleaning up the multiple references that were added to a page in order to
-avoid calling get_page per allocation.
-
-Alexander Duyck, Nov 29, 2016.
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
deleted file mode 100644
index 8c5cb8147e55..000000000000
--- a/Documentation/vm/page_migration.rst
+++ /dev/null
@@ -1,288 +0,0 @@
-.. _page_migration:
-
-==============
-Page migration
-==============
-
-Page migration allows moving the physical location of pages between
-nodes in a NUMA system while the process is running. This means that the
-virtual addresses that the process sees do not change. However, the
-system rearranges the physical location of those pages.
-
-Also see :ref:`Heterogeneous Memory Management (HMM) <hmm>`
-for migrating pages to or from device private memory.
-
-The main intent of page migration is to reduce the latency of memory accesses
-by moving pages near to the processor where the process accessing that memory
-is running.
-
-Page migration allows a process to manually relocate the node on which its
-pages are located through the MF_MOVE and MF_MOVE_ALL options while setting
-a new memory policy via mbind(). The pages of a process can also be relocated
-from another process using the sys_migrate_pages() function call. The
-migrate_pages() function call takes two sets of nodes and moves pages of a
-process that are located on the from nodes to the destination nodes.
-Page migration functions are provided by the numactl package by Andi Kleen
-(a version later than 0.9.3 is required. Get it from
-https://github.com/numactl/numactl.git). numactl provides libnuma
-which provides an interface similar to other NUMA functionality for page
-migration.  cat ``/proc/<pid>/numa_maps`` allows an easy review of where the
-pages of a process are located. See also the numa_maps documentation in the
-proc(5) man page.
-
-Manual migration is useful if for example the scheduler has relocated
-a process to a processor on a distant node. A batch scheduler or an
-administrator may detect the situation and move the pages of the process
-nearer to the new processor. The kernel itself only provides
-manual page migration support. Automatic page migration may be implemented
-through user space processes that move pages. A special function call
-"move_pages" allows the moving of individual pages within a process.
-For example, A NUMA profiler may obtain a log showing frequent off-node
-accesses and may use the result to move pages to more advantageous
-locations.
-
-Larger installations usually partition the system using cpusets into
-sections of nodes. Paul Jackson has equipped cpusets with the ability to
-move pages when a task is moved to another cpuset (See
-:ref:`CPUSETS <cpusets>`).
-Cpusets allow the automation of process locality. If a task is moved to
-a new cpuset then also all its pages are moved with it so that the
-performance of the process does not sink dramatically. Also the pages
-of processes in a cpuset are moved if the allowed memory nodes of a
-cpuset are changed.
-
-Page migration allows the preservation of the relative location of pages
-within a group of nodes for all migration techniques which will preserve a
-particular memory allocation pattern generated even after migrating a
-process. This is necessary in order to preserve the memory latencies.
-Processes will run with similar performance after migration.
-
-Page migration occurs in several steps. First a high level
-description for those trying to use migrate_pages() from the kernel
-(for userspace usage see the Andi Kleen's numactl package mentioned above)
-and then a low level description of how the low level details work.
-
-In kernel use of migrate_pages()
-================================
-
-1. Remove pages from the LRU.
-
-   Lists of pages to be migrated are generated by scanning over
-   pages and moving them into lists. This is done by
-   calling isolate_lru_page().
-   Calling isolate_lru_page() increases the references to the page
-   so that it cannot vanish while the page migration occurs.
-   It also prevents the swapper or other scans from encountering
-   the page.
-
-2. We need to have a function of type new_page_t that can be
-   passed to migrate_pages(). This function should figure out
-   how to allocate the correct new page given the old page.
-
-3. The migrate_pages() function is called which attempts
-   to do the migration. It will call the function to allocate
-   the new page for each page that is considered for
-   moving.
-
-How migrate_pages() works
-=========================
-
-migrate_pages() does several passes over its list of pages. A page is moved
-if all references to a page are removable at the time. The page has
-already been removed from the LRU via isolate_lru_page() and the refcount
-is increased so that the page cannot be freed while page migration occurs.
-
-Steps:
-
-1. Lock the page to be migrated.
-
-2. Ensure that writeback is complete.
-
-3. Lock the new page that we want to move to. It is locked so that accesses to
-   this (not yet up-to-date) page immediately block while the move is in progress.
-
-4. All the page table references to the page are converted to migration
-   entries. This decreases the mapcount of a page. If the resulting
-   mapcount is not zero then we do not migrate the page. All user space
-   processes that attempt to access the page will now wait on the page lock
-   or wait for the migration page table entry to be removed.
-
-5. The i_pages lock is taken. This will cause all processes trying
-   to access the page via the mapping to block on the spinlock.
-
-6. The refcount of the page is examined and we back out if references remain.
-   Otherwise, we know that we are the only one referencing this page.
-
-7. The radix tree is checked and if it does not contain the pointer to this
-   page then we back out because someone else modified the radix tree.
-
-8. The new page is prepped with some settings from the old page so that
-   accesses to the new page will discover a page with the correct settings.
-
-9. The radix tree is changed to point to the new page.
-
-10. The reference count of the old page is dropped because the address space
-    reference is gone. A reference to the new page is established because
-    the new page is referenced by the address space.
-
-11. The i_pages lock is dropped. With that lookups in the mapping
-    become possible again. Processes will move from spinning on the lock
-    to sleeping on the locked new page.
-
-12. The page contents are copied to the new page.
-
-13. The remaining page flags are copied to the new page.
-
-14. The old page flags are cleared to indicate that the page does
-    not provide any information anymore.
-
-15. Queued up writeback on the new page is triggered.
-
-16. If migration entries were inserted into the page table, then replace them
-    with real ptes. Doing so will enable access for user space processes not
-    already waiting for the page lock.
-
-17. The page locks are dropped from the old and new page.
-    Processes waiting on the page lock will redo their page faults
-    and will reach the new page.
-
-18. The new page is moved to the LRU and can be scanned by the swapper,
-    etc. again.
-
-Non-LRU page migration
-======================
-
-Although migration originally aimed for reducing the latency of memory accesses
-for NUMA, compaction also uses migration to create high-order pages.
-
-Current problem of the implementation is that it is designed to migrate only
-*LRU* pages. However, there are potential non-LRU pages which can be migrated
-in drivers, for example, zsmalloc, virtio-balloon pages.
-
-For virtio-balloon pages, some parts of migration code path have been hooked
-up and added virtio-balloon specific functions to intercept migration logics.
-It's too specific to a driver so other drivers who want to make their pages
-movable would have to add their own specific hooks in the migration path.
-
-To overcome the problem, VM supports non-LRU page migration which provides
-generic functions for non-LRU movable pages without driver specific hooks
-in the migration path.
-
-If a driver wants to make its pages movable, it should define three functions
-which are function pointers of struct address_space_operations.
-
-1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);``
-
-   What VM expects from isolate_page() function of driver is to return *true*
-   if driver isolates the page successfully. On returning true, VM marks the page
-   as PG_isolated so concurrent isolation in several CPUs skip the page
-   for isolation. If a driver cannot isolate the page, it should return *false*.
-
-   Once page is successfully isolated, VM uses page.lru fields so driver
-   shouldn't expect to preserve values in those fields.
-
-2. ``int (*migratepage) (struct address_space *mapping,``
-|	``struct page *newpage, struct page *oldpage, enum migrate_mode);``
-
-   After isolation, VM calls migratepage() of driver with the isolated page.
-   The function of migratepage() is to move the contents of the old page to the
-   new page
-   and set up fields of struct page newpage. Keep in mind that you should
-   indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
-   under page_lock if you migrated the oldpage successfully and returned
-   MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
-   can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
-   because VM interprets -EAGAIN as "temporary migration failure". On returning
-   any error except -EAGAIN, VM will give up the page migration without
-   retrying.
-
-   Driver shouldn't touch the page.lru field while in the migratepage() function.
-
-3. ``void (*putback_page)(struct page *);``
-
-   If migration fails on the isolated page, VM should return the isolated page
-   to the driver so VM calls the driver's putback_page() with the isolated page.
-   In this function, the driver should put the isolated page back into its own data
-   structure.
-
-Non-LRU movable page flags
-
-   There are two page flags for supporting non-LRU movable page.
-
-   * PG_movable
-
-     Driver should use the function below to make page movable under page_lock::
-
-	void __SetPageMovable(struct page *page, struct address_space *mapping)
-
-     It needs argument of address_space for registering migration
-     family functions which will be called by VM. Exactly speaking,
-     PG_movable is not a real flag of struct page. Rather, VM
-     reuses the page->mapping's lower bits to represent it::
-
-	#define PAGE_MAPPING_MOVABLE 0x2
-	page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
-
-     so driver shouldn't access page->mapping directly. Instead, driver should
-     use page_mapping() which masks off the low two bits of page->mapping under
-     page lock so it can get the right struct address_space.
-
-     For testing of non-LRU movable pages, VM supports __PageMovable() function.
-     However, it doesn't guarantee to identify non-LRU movable pages because
-     the page->mapping field is unified with other variables in struct page.
-     If the driver releases the page after isolation by VM, page->mapping
-     doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set
-     (look at __ClearPageMovable). But __PageMovable() is cheap to call whether
-     page is LRU or non-LRU movable once the page has been isolated because LRU
-     pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also
-     good for just peeking to test non-LRU movable pages before more expensive
-     checking with lock_page() in pfn scanning to select a victim.
-
-     For guaranteeing non-LRU movable page, VM provides PageMovable() function.
-     Unlike __PageMovable(), PageMovable() validates page->mapping and
-     mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents
-     sudden destroying of page->mapping.
-
-     Drivers using __SetPageMovable() should clear the flag via
-     __ClearMovablePage() under page_lock() before the releasing the page.
-
-   * PG_isolated
-
-     To prevent concurrent isolation among several CPUs, VM marks isolated page
-     as PG_isolated under lock_page(). So if a CPU encounters PG_isolated
-     non-LRU movable page, it can skip it. Driver doesn't need to manipulate the
-     flag because VM will set/clear it automatically. Keep in mind that if the
-     driver sees a PG_isolated page, it means the page has been isolated by the
-     VM so it shouldn't touch the page.lru field.
-     The PG_isolated flag is aliased with the PG_reclaim flag so drivers
-     shouldn't use PG_isolated for its own purposes.
-
-Monitoring Migration
-=====================
-
-The following events (counters) can be used to monitor page migration.
-
-1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
-   page was migrated. If the page was a non-THP and non-hugetlb page, then
-   this counter is increased by one. If the page was a THP or hugetlb, then
-   this counter is increased by the number of THP or hugetlb subpages.
-   For example, migration of a single 2MB THP that has 4KB-size base pages
-   (subpages) will cause this counter to increase by 512.
-
-2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
-   PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
-   if it was a THP or hugetlb.
-
-3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
-
-4. THP_MIGRATION_FAIL: A THP could not be migrated nor it could be split.
-
-5. THP_MIGRATION_SPLIT: A THP was migrated, but not as such: first, the THP had
-   to be split. After splitting, a migration retry was used for it's sub-pages.
-
-THP_MIGRATION_* events also update the appropriate PGMIGRATE_SUCCESS or
-PGMIGRATE_FAIL events. For example, a THP migration failure will cause both
-THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase.
-
-Christoph Lameter, May 8, 2006.
-Minchan Kim, Mar 28, 2016.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
deleted file mode 100644
index f5c954afe97c..000000000000
--- a/Documentation/vm/page_owner.rst
+++ /dev/null
@@ -1,196 +0,0 @@
-.. _page_owner:
-
-==================================================
-page owner: Tracking about who allocated each page
-==================================================
-
-Introduction
-============
-
-page owner is for the tracking about who allocated each page.
-It can be used to debug memory leak or to find a memory hogger.
-When allocation happens, information about allocation such as call stack
-and order of pages is stored into certain storage for each page.
-When we need to know about status of all pages, we can get and analyze
-this information.
-
-Although we already have tracepoint for tracing page allocation/free,
-using it for analyzing who allocate each page is rather complex. We need
-to enlarge the trace buffer for preventing overlapping until userspace
-program launched. And, launched program continually dump out the trace
-buffer for later analysis and it would change system behaviour with more
-possibility rather than just keeping it in memory, so bad for debugging.
-
-page owner can also be used for various purposes. For example, accurate
-fragmentation statistics can be obtained through gfp flag information of
-each page. It is already implemented and activated if page owner is
-enabled. Other usages are more than welcome.
-
-page owner is disabled by default. So, if you'd like to use it, you need
-to add "page_owner=on" to your boot cmdline. If the kernel is built
-with page owner and page owner is disabled in runtime due to not enabling
-boot option, runtime overhead is marginal. If disabled in runtime, it
-doesn't require memory to store owner information, so there is no runtime
-memory overhead. And, page owner inserts just two unlikely branches into
-the page allocator hotpath and if not enabled, then allocation is done
-like as the kernel without page owner. These two unlikely branches should
-not affect to allocation performance, especially if the static keys jump
-label patching functionality is available. Following is the kernel's code
-size change due to this facility.
-
-- Without page owner::
-
-   text    data     bss     dec     hex filename
-   48392   2333     644   51369    c8a9 mm/page_alloc.o
-
-- With page owner::
-
-   text    data     bss     dec     hex filename
-   48800   2445     644   51889    cab1 mm/page_alloc.o
-   6662     108      29    6799    1a8f mm/page_owner.o
-   1025       8       8    1041     411 mm/page_ext.o
-
-Although, roughly, 8 KB code is added in total, page_alloc.o increase by
-520 bytes and less than half of it is in hotpath. Building the kernel with
-page owner and turning it on if needed would be great option to debug
-kernel memory problem.
-
-There is one notice that is caused by implementation detail. page owner
-stores information into the memory from struct page extension. This memory
-is initialized some time later than that page allocator starts in sparse
-memory system, so, until initialization, many pages can be allocated and
-they would have no owner information. To fix it up, these early allocated
-pages are investigated and marked as allocated in initialization phase.
-Although it doesn't mean that they have the right owner information,
-at least, we can tell whether the page is allocated or not,
-more accurately. On 2GB memory x86-64 VM box, 13343 early allocated pages
-are catched and marked, although they are mostly allocated from struct
-page extension feature. Anyway, after that, no page is left in
-un-tracking state.
-
-Usage
-=====
-
-1) Build user-space helper::
-
-	cd tools/vm
-	make page_owner_sort
-
-2) Enable page owner: add "page_owner=on" to boot cmdline.
-
-3) Do the job that you want to debug.
-
-4) Analyze information from page owner::
-
-	cat /sys/kernel/debug/page_owner > page_owner_full.txt
-	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
-
-   The general output of ``page_owner_full.txt`` is as follows::
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-	Page allocated via order XXX, ...
-	PFN XXX ...
-	// Detailed stack
-
-   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
-   in buf, uses regexp to extract the page order value, counts the times
-   and pages of buf, and finally sorts them according to the parameter(s).
-
-   See the result about who allocated each page
-   in the ``sorted_page_owner.txt``. General output::
-
-	XXX times, XXX pages:
-	Page allocated via order XXX, ...
-	// Detailed stack
-
-   By default, ``page_owner_sort`` is sorted according to the times of buf.
-   If you want to sort by the page nums of buf, use the ``-m`` parameter.
-   The detailed parameters are:
-
-   fundamental function::
-
-	Sort:
-		-a		Sort by memory allocation time.
-		-m		Sort by total memory.
-		-p		Sort by pid.
-		-P		Sort by tgid.
-		-n		Sort by task command name.
-		-r		Sort by memory release time.
-		-s		Sort by stack trace.
-		-t		Sort by times (default).
-		--sort <order>	Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].
-				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
-				optional since default direction is increasing numerical or lexicographic
-				order. Mixed use of abbreviated and complete-form of keys is allowed.
-
-		Examples:
-				./page_owner_sort <input> <output> --sort=n,+pid,-tgid
-				./page_owner_sort <input> <output> --sort=at
-
-   additional function::
-
-	Cull:
-		--cull <rules>
-				Specify culling rules.Culling syntax is key[,key[,...]].Choose a
-				multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
-
-		<rules> is a single argument in the form of a comma-separated list,
-		which offers a way to specify individual culling rules.  The recognized
-		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
-		<rules> can be specified by the sequence of keys k1,k2, ..., as described in
-		the STANDARD SORT KEYS section below. Mixed use of abbreviated and
-		complete-form of keys is allowed.
-
-		Examples:
-				./page_owner_sort <input> <output> --cull=stacktrace
-				./page_owner_sort <input> <output> --cull=st,pid,name
-				./page_owner_sort <input> <output> --cull=n,f
-
-	Filter:
-		-f		Filter out the information of blocks whose memory has been released.
-
-	Select:
-		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
-					numbers appear in <pidlist>.
-		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
-					group ID numbers appear in <tgidlist>.
-		--name <cmdlist>	Select by task command name. This selects the blocks whose
-					task command name appear in <cmdlist>.
-
-		<pidlist>, <tgidlist>, <cmdlist> are single arguments in the form of a comma-separated list,
-		which offers a way to specify individual selecting rules.
-
-
-		Examples:
-				./page_owner_sort <input> <output> --pid=1
-				./page_owner_sort <input> <output> --tgid=1,2,3
-				./page_owner_sort <input> <output> --name name1,name2
-
-STANDARD FORMAT SPECIFIERS
-==========================
-::
-
-  For --sort option:
-
-	KEY		LONG		DESCRIPTION
-	p		pid		process ID
-	tg		tgid		thread group ID
-	n		name		task command name
-	st		stacktrace	stack trace of the page allocation
-	T		txt		full text of block
-	ft		free_ts		timestamp of the page when it was released
-	at		alloc_ts	timestamp of the page when it was allocated
-	ator		allocator	memory allocator for pages
-
-  For --curl option:
-
-	KEY		LONG		DESCRIPTION
-	p		pid		process ID
-	tg		tgid		thread group ID
-	n		name		task command name
-	f		free		whether the page has been released or not
-	st		stacktrace	stack trace of the page allocation
-	ator		allocator	memory allocator for pages
diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/vm/page_reclaim.rst
deleted file mode 100644
index 50a30b7f8ac3..000000000000
--- a/Documentation/vm/page_reclaim.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-============
-Page Reclaim
-============
diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst
deleted file mode 100644
index 1a09472f10a3..000000000000
--- a/Documentation/vm/page_table_check.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. _page_table_check:
-
-================
-Page Table Check
-================
-
-Introduction
-============
-
-Page table check allows to harden the kernel by ensuring that some types of
-the memory corruptions are prevented.
-
-Page table check performs extra verifications at the time when new pages become
-accessible from the userspace by getting their page table entries (PTEs PMDs
-etc.) added into the table.
-
-In case of detected corruption, the kernel is crashed. There is a small
-performance and memory overhead associated with the page table check. Therefore,
-it is disabled by default, but can be optionally enabled on systems where the
-extra hardening outweighs the performance costs. Also, because page table check
-is synchronous, it can help with debugging double map memory corruption issues,
-by crashing kernel at the time wrong mapping occurs instead of later which is
-often the case with memory corruptions bugs.
-
-Double mapping detection logic
-==============================
-
-+-------------------+-------------------+-------------------+------------------+
-| Current Mapping   | New mapping       | Permissions       | Rule             |
-+===================+===================+===================+==================+
-| Anonymous         | Anonymous         | Read              | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Anonymous         | Read / Write      | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Anonymous         | Named             | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Anonymous         | Any               | Prohibit         |
-+-------------------+-------------------+-------------------+------------------+
-| Named             | Named             | Any               | Allow            |
-+-------------------+-------------------+-------------------+------------------+
-
-Enabling Page Table Check
-=========================
-
-Build kernel with:
-
-- PAGE_TABLE_CHECK=y
-  Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
-  is available.
-
-- Boot with 'page_table_check=on' kernel parameter.
-
-Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
-table support without extra kernel parameter.
diff --git a/Documentation/vm/page_tables.rst b/Documentation/vm/page_tables.rst
deleted file mode 100644
index 96939571d7bc..000000000000
--- a/Documentation/vm/page_tables.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===========
-Page Tables
-===========
diff --git a/Documentation/vm/physical_memory.rst b/Documentation/vm/physical_memory.rst
deleted file mode 100644
index 2ab7b8c1c863..000000000000
--- a/Documentation/vm/physical_memory.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Physical Memory
-===============
diff --git a/Documentation/vm/process_addrs.rst b/Documentation/vm/process_addrs.rst
deleted file mode 100644
index e8618fbc62c9..000000000000
--- a/Documentation/vm/process_addrs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=================
-Process Addresses
-=================
diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/vm/remap_file_pages.rst
deleted file mode 100644
index 7bef6718e3a9..000000000000
--- a/Documentation/vm/remap_file_pages.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. _remap_file_pages:
-
-==============================
-remap_file_pages() system call
-==============================
-
-The remap_file_pages() system call is used to create a nonlinear mapping,
-that is, a mapping in which the pages of the file are mapped into a
-nonsequential order in memory. The advantage of using remap_file_pages()
-over using repeated calls to mmap(2) is that the former approach does not
-require the kernel to create additional VMA (Virtual Memory Area) data
-structures.
-
-Supporting of nonlinear mapping requires significant amount of non-trivial
-code in kernel virtual memory subsystem including hot paths. Also to get
-nonlinear mapping work kernel need a way to distinguish normal page table
-entries from entries with file offset (pte_file). Kernel reserves flag in
-PTE for this purpose. PTE flags are scarce resource especially on some CPU
-architectures. It would be nice to free up the flag for other usage.
-
-Fortunately, there are not many users of remap_file_pages() in the wild.
-It's only known that one enterprise RDBMS implementation uses the syscall
-on 32-bit systems to map files bigger than can linearly fit into 32-bit
-virtual address space. This use-case is not critical anymore since 64-bit
-systems are widely available.
-
-The syscall is deprecated and replaced it with an emulation now. The
-emulation creates new VMAs instead of nonlinear mappings. It's going to
-work slower for rare users of remap_file_pages() but ABI is preserved.
-
-One side effect of emulation (apart from performance) is that user can hit
-vm.max_map_count limit more easily due to additional VMAs. See comment for
-DEFAULT_MAX_MAP_COUNT for more details on the limit.
diff --git a/Documentation/vm/shmfs.rst b/Documentation/vm/shmfs.rst
deleted file mode 100644
index 8b01ebb4c30e..000000000000
--- a/Documentation/vm/shmfs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-========================
-Shared Memory Filesystem
-========================
diff --git a/Documentation/vm/slab.rst b/Documentation/vm/slab.rst
deleted file mode 100644
index 87d5a5bb172f..000000000000
--- a/Documentation/vm/slab.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Slab Allocation
-===============
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
deleted file mode 100644
index 43063ade737a..000000000000
--- a/Documentation/vm/slub.rst
+++ /dev/null
@@ -1,452 +0,0 @@
-.. _slub:
-
-==========================
-Short users guide for SLUB
-==========================
-
-The basic philosophy of SLUB is very different from SLAB. SLAB
-requires rebuilding the kernel to activate debug options for all
-slab caches. SLUB always includes full debugging but it is off by default.
-SLUB can enable debugging only for selected slabs in order to avoid
-an impact on overall system performance which may make a bug more
-difficult to find.
-
-In order to switch debugging on one can add an option ``slub_debug``
-to the kernel command line. That will enable full debugging for
-all slabs.
-
-Typically one would then use the ``slabinfo`` command to get statistical
-data and perform operation on the slabs. By default ``slabinfo`` only lists
-slabs that have data in them. See "slabinfo -h" for more options when
-running the command. ``slabinfo`` can be compiled with
-::
-
-	gcc -o slabinfo tools/vm/slabinfo.c
-
-Some of the modes of operation of ``slabinfo`` require that slub debugging
-be enabled on the command line. F.e. no tracking information will be
-available without debugging on and validation can only partially
-be performed if debugging was not switched on.
-
-Some more sophisticated uses of slub_debug:
--------------------------------------------
-
-Parameters may be given to ``slub_debug``. If none is specified then full
-debugging is enabled. Format:
-
-slub_debug=<Debug-Options>
-	Enable options for all slabs
-
-slub_debug=<Debug-Options>,<slab name1>,<slab name2>,...
-	Enable options only for select slabs (no spaces
-	after a comma)
-
-Multiple blocks of options for all slabs or selected slabs can be given, with
-blocks of options delimited by ';'. The last of "all slabs" blocks is applied
-to all slabs except those that match one of the "select slabs" block. Options
-of the first "select slabs" blocks that matches the slab's name are applied.
-
-Possible debug options are::
-
-	F		Sanity checks on (enables SLAB_DEBUG_CONSISTENCY_CHECKS
-			Sorry SLAB legacy issues)
-	Z		Red zoning
-	P		Poisoning (object and padding)
-	U		User tracking (free and alloc)
-	T		Trace (please only use on single slabs)
-	A		Enable failslab filter mark for the cache
-	O		Switch debugging off for caches that would have
-			caused higher minimum slab orders
-	-		Switch all debugging off (useful if the kernel is
-			configured with CONFIG_SLUB_DEBUG_ON)
-
-F.e. in order to boot just with sanity checks and red zoning one would specify::
-
-	slub_debug=FZ
-
-Trying to find an issue in the dentry cache? Try::
-
-	slub_debug=,dentry
-
-to only enable debugging on the dentry cache.  You may use an asterisk at the
-end of the slab name, in order to cover all slabs with the same prefix.  For
-example, here's how you can poison the dentry cache as well as all kmalloc
-slabs::
-
-	slub_debug=P,kmalloc-*,dentry
-
-Red zoning and tracking may realign the slab.  We can just apply sanity checks
-to the dentry cache with::
-
-	slub_debug=F,dentry
-
-Debugging options may require the minimum possible slab order to increase as
-a result of storing the metadata (for example, caches with PAGE_SIZE object
-sizes).  This has a higher liklihood of resulting in slab allocation errors
-in low memory situations or if there's high fragmentation of memory.  To
-switch off debugging for such caches by default, use::
-
-	slub_debug=O
-
-You can apply different options to different list of slab names, using blocks
-of options. This will enable red zoning for dentry and user tracking for
-kmalloc. All other slabs will not get any debugging enabled::
-
-	slub_debug=Z,dentry;U,kmalloc-*
-
-You can also enable options (e.g. sanity checks and poisoning) for all caches
-except some that are deemed too performance critical and don't need to be
-debugged by specifying global debug options followed by a list of slab names
-with "-" as options::
-
-	slub_debug=FZ;-,zs_handle,zspage
-
-The state of each debug option for a slab can be found in the respective files
-under::
-
-	/sys/kernel/slab/<slab name>/
-
-If the file contains 1, the option is enabled, 0 means disabled. The debug
-options from the ``slub_debug`` parameter translate to the following files::
-
-	F	sanity_checks
-	Z	red_zone
-	P	poison
-	U	store_user
-	T	trace
-	A	failslab
-
-Careful with tracing: It may spew out lots of information and never stop if
-used on the wrong slab.
-
-Slab merging
-============
-
-If no debug options are specified then SLUB may merge similar slabs together
-in order to reduce overhead and increase cache hotness of objects.
-``slabinfo -a`` displays which slabs were merged together.
-
-Slab validation
-===============
-
-SLUB can validate all object if the kernel was booted with slub_debug. In
-order to do so you must have the ``slabinfo`` tool. Then you can do
-::
-
-	slabinfo -v
-
-which will test all objects. Output will be generated to the syslog.
-
-This also works in a more limited way if boot was without slab debug.
-In that case ``slabinfo -v`` simply tests all reachable objects. Usually
-these are in the cpu slabs and the partial slabs. Full slabs are not
-tracked by SLUB in a non debug situation.
-
-Getting more performance
-========================
-
-To some degree SLUB's performance is limited by the need to take the
-list_lock once in a while to deal with partial slabs. That overhead is
-governed by the order of the allocation for each slab. The allocations
-can be influenced by kernel parameters:
-
-.. slub_min_objects=x		(default 4)
-.. slub_min_order=x		(default 0)
-.. slub_max_order=x		(default 3 (PAGE_ALLOC_COSTLY_ORDER))
-
-``slub_min_objects``
-	allows to specify how many objects must at least fit into one
-	slab in order for the allocation order to be acceptable.  In
-	general slub will be able to perform this number of
-	allocations on a slab without consulting centralized resources
-	(list_lock) where contention may occur.
-
-``slub_min_order``
-	specifies a minimum order of slabs. A similar effect like
-	``slub_min_objects``.
-
-``slub_max_order``
-	specified the order at which ``slub_min_objects`` should no
-	longer be checked. This is useful to avoid SLUB trying to
-	generate super large order pages to fit ``slub_min_objects``
-	of a slab cache with large object sizes into one high order
-	page. Setting command line parameter
-	``debug_guardpage_minorder=N`` (N > 0), forces setting
-	``slub_max_order`` to 0, what cause minimum possible order of
-	slabs allocation.
-
-SLUB Debug output
-=================
-
-Here is a sample of slub debug output::
-
- ====================================================================
- BUG kmalloc-8: Right Redzone overwritten
- --------------------------------------------------------------------
-
- INFO: 0xc90f6d28-0xc90f6d2b. First byte 0x00 instead of 0xcc
- INFO: Slab 0xc528c530 flags=0x400000c3 inuse=61 fp=0xc90f6d58
- INFO: Object 0xc90f6d20 @offset=3360 fp=0xc90f6d58
- INFO: Allocated in get_modalias+0x61/0xf5 age=53 cpu=1 pid=554
-
- Bytes b4 (0xc90f6d10): 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ
- Object   (0xc90f6d20): 31 30 31 39 2e 30 30 35                         1019.005
- Redzone  (0xc90f6d28): 00 cc cc cc                                     .
- Padding  (0xc90f6d50): 5a 5a 5a 5a 5a 5a 5a 5a                         ZZZZZZZZ
-
-   [<c010523d>] dump_trace+0x63/0x1eb
-   [<c01053df>] show_trace_log_lvl+0x1a/0x2f
-   [<c010601d>] show_trace+0x12/0x14
-   [<c0106035>] dump_stack+0x16/0x18
-   [<c017e0fa>] object_err+0x143/0x14b
-   [<c017e2cc>] check_object+0x66/0x234
-   [<c017eb43>] __slab_free+0x239/0x384
-   [<c017f446>] kfree+0xa6/0xc6
-   [<c02e2335>] get_modalias+0xb9/0xf5
-   [<c02e23b7>] dmi_dev_uevent+0x27/0x3c
-   [<c027866a>] dev_uevent+0x1ad/0x1da
-   [<c0205024>] kobject_uevent_env+0x20a/0x45b
-   [<c020527f>] kobject_uevent+0xa/0xf
-   [<c02779f1>] store_uevent+0x4f/0x58
-   [<c027758e>] dev_attr_store+0x29/0x2f
-   [<c01bec4f>] sysfs_write_file+0x16e/0x19c
-   [<c0183ba7>] vfs_write+0xd1/0x15a
-   [<c01841d7>] sys_write+0x3d/0x72
-   [<c0104112>] sysenter_past_esp+0x5f/0x99
-   [<b7f7b410>] 0xb7f7b410
-   =======================
-
- FIX kmalloc-8: Restoring Redzone 0xc90f6d28-0xc90f6d2b=0xcc
-
-If SLUB encounters a corrupted object (full detection requires the kernel
-to be booted with slub_debug) then the following output will be dumped
-into the syslog:
-
-1. Description of the problem encountered
-
-   This will be a message in the system log starting with::
-
-     ===============================================
-     BUG <slab cache affected>: <What went wrong>
-     -----------------------------------------------
-
-     INFO: <corruption start>-<corruption_end> <more info>
-     INFO: Slab <address> <slab information>
-     INFO: Object <address> <object information>
-     INFO: Allocated in <kernel function> age=<jiffies since alloc> cpu=<allocated by
-	cpu> pid=<pid of the process>
-     INFO: Freed in <kernel function> age=<jiffies since free> cpu=<freed by cpu>
-	pid=<pid of the process>
-
-   (Object allocation / free information is only available if SLAB_STORE_USER is
-   set for the slab. slub_debug sets that option)
-
-2. The object contents if an object was involved.
-
-   Various types of lines can follow the BUG SLUB line:
-
-   Bytes b4 <address> : <bytes>
-	Shows a few bytes before the object where the problem was detected.
-	Can be useful if the corruption does not stop with the start of the
-	object.
-
-   Object <address> : <bytes>
-	The bytes of the object. If the object is inactive then the bytes
-	typically contain poison values. Any non-poison value shows a
-	corruption by a write after free.
-
-   Redzone <address> : <bytes>
-	The Redzone following the object. The Redzone is used to detect
-	writes after the object. All bytes should always have the same
-	value. If there is any deviation then it is due to a write after
-	the object boundary.
-
-	(Redzone information is only available if SLAB_RED_ZONE is set.
-	slub_debug sets that option)
-
-   Padding <address> : <bytes>
-	Unused data to fill up the space in order to get the next object
-	properly aligned. In the debug case we make sure that there are
-	at least 4 bytes of padding. This allows the detection of writes
-	before the object.
-
-3. A stackdump
-
-   The stackdump describes the location where the error was detected. The cause
-   of the corruption is may be more likely found by looking at the function that
-   allocated or freed the object.
-
-4. Report on how the problem was dealt with in order to ensure the continued
-   operation of the system.
-
-   These are messages in the system log beginning with::
-
-	FIX <slab cache affected>: <corrective action taken>
-
-   In the above sample SLUB found that the Redzone of an active object has
-   been overwritten. Here a string of 8 characters was written into a slab that
-   has the length of 8 characters. However, a 8 character string needs a
-   terminating 0. That zero has overwritten the first byte of the Redzone field.
-   After reporting the details of the issue encountered the FIX SLUB message
-   tells us that SLUB has restored the Redzone to its proper value and then
-   system operations continue.
-
-Emergency operations
-====================
-
-Minimal debugging (sanity checks alone) can be enabled by booting with::
-
-	slub_debug=F
-
-This will be generally be enough to enable the resiliency features of slub
-which will keep the system running even if a bad kernel component will
-keep corrupting objects. This may be important for production systems.
-Performance will be impacted by the sanity checks and there will be a
-continual stream of error messages to the syslog but no additional memory
-will be used (unlike full debugging).
-
-No guarantees. The kernel component still needs to be fixed. Performance
-may be optimized further by locating the slab that experiences corruption
-and enabling debugging only for that cache
-
-I.e.::
-
-	slub_debug=F,dentry
-
-If the corruption occurs by writing after the end of the object then it
-may be advisable to enable a Redzone to avoid corrupting the beginning
-of other objects::
-
-	slub_debug=FZ,dentry
-
-Extended slabinfo mode and plotting
-===================================
-
-The ``slabinfo`` tool has a special 'extended' ('-X') mode that includes:
- - Slabcache Totals
- - Slabs sorted by size (up to -N <num> slabs, default 1)
- - Slabs sorted by loss (up to -N <num> slabs, default 1)
-
-Additionally, in this mode ``slabinfo`` does not dynamically scale
-sizes (G/M/K) and reports everything in bytes (this functionality is
-also available to other slabinfo modes via '-B' option) which makes
-reporting more precise and accurate. Moreover, in some sense the `-X'
-mode also simplifies the analysis of slabs' behaviour, because its
-output can be plotted using the ``slabinfo-gnuplot.sh`` script. So it
-pushes the analysis from looking through the numbers (tons of numbers)
-to something easier -- visual analysis.
-
-To generate plots:
-
-a) collect slabinfo extended records, for example::
-
-	while [ 1 ]; do slabinfo -X >> FOO_STATS; sleep 1; done
-
-b) pass stats file(-s) to ``slabinfo-gnuplot.sh`` script::
-
-	slabinfo-gnuplot.sh FOO_STATS [FOO_STATS2 .. FOO_STATSN]
-
-   The ``slabinfo-gnuplot.sh`` script will pre-processes the collected records
-   and generates 3 png files (and 3 pre-processing cache files) per STATS
-   file:
-   - Slabcache Totals: FOO_STATS-totals.png
-   - Slabs sorted by size: FOO_STATS-slabs-by-size.png
-   - Slabs sorted by loss: FOO_STATS-slabs-by-loss.png
-
-Another use case, when ``slabinfo-gnuplot.sh`` can be useful, is when you
-need to compare slabs' behaviour "prior to" and "after" some code
-modification.  To help you out there, ``slabinfo-gnuplot.sh`` script
-can 'merge' the `Slabcache Totals` sections from different
-measurements. To visually compare N plots:
-
-a) Collect as many STATS1, STATS2, .. STATSN files as you need::
-
-	while [ 1 ]; do slabinfo -X >> STATS<X>; sleep 1; done
-
-b) Pre-process those STATS files::
-
-	slabinfo-gnuplot.sh STATS1 STATS2 .. STATSN
-
-c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
-   generated pre-processed \*-totals::
-
-	slabinfo-gnuplot.sh -t STATS1-totals STATS2-totals .. STATSN-totals
-
-   This will produce a single plot (png file).
-
-   Plots, expectedly, can be large so some fluctuations or small spikes
-   can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two
-   options to 'zoom-in'/'zoom-out':
-
-   a) ``-s %d,%d`` -- overwrites the default image width and height
-   b) ``-r %d,%d`` -- specifies a range of samples to use (for example,
-      in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r
-      40,60`` range will plot only samples collected between 40th and
-      60th seconds).
-
-
-DebugFS files for SLUB
-======================
-
-For more information about current state of SLUB caches with the user tracking
-debug option enabled, debugfs files are available, typically under
-/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user
-tracking). There are 2 types of these files with the following debug
-information:
-
-1. alloc_traces::
-
-    Prints information about unique allocation traces of the currently
-    allocated objects. The output is sorted by frequency of each trace.
-
-    Information in the output:
-    Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
-    pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
-
-    Example:::
-
-    1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1::
-	__slab_alloc+0x6d/0x90
-	kmem_cache_alloc_trace+0x2eb/0x300
-	populate_error_injection_list+0x97/0x110
-	init_error_injection+0x1b/0x71
-	do_one_initcall+0x5f/0x2d0
-	kernel_init_freeable+0x26f/0x2d7
-	kernel_init+0xe/0x118
-	ret_from_fork+0x22/0x30
-
-
-2. free_traces::
-
-    Prints information about unique freeing traces of the currently allocated
-    objects. The freeing traces thus come from the previous life-cycle of the
-    objects and are reported as not available for objects allocated for the first
-    time. The output is sorted by frequency of each trace.
-
-    Information in the output:
-    Number of objects, freeing function, minimal/average/maximal jiffies since free,
-    pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
-
-    Example:::
-
-    1980 <not-available> age=4294912290 pid=0 cpus=0
-    51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
-	kfree+0x2db/0x420
-	acpi_ut_update_ref_count+0x6a6/0x782
-	acpi_ut_update_object_reference+0x1ad/0x234
-	acpi_ut_remove_reference+0x7d/0x84
-	acpi_rs_get_prt_method_data+0x97/0xd6
-	acpi_get_irq_routing_table+0x82/0xc4
-	acpi_pci_irq_find_prt_entry+0x8e/0x2e0
-	acpi_pci_irq_lookup+0x3a/0x1e0
-	acpi_pci_irq_enable+0x77/0x240
-	pcibios_enable_device+0x39/0x40
-	do_pci_enable_device.part.0+0x5d/0xe0
-	pci_enable_device_flags+0xfc/0x120
-	pci_enable_device+0x13/0x20
-	virtio_pci_probe+0x9e/0x170
-	local_pci_probe+0x48/0x80
-	pci_device_probe+0x105/0x1c0
-
-Christoph Lameter, May 30, 2007
-Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst
deleted file mode 100644
index c08919662704..000000000000
--- a/Documentation/vm/split_page_table_lock.rst
+++ /dev/null
@@ -1,100 +0,0 @@
-.. _split_page_table_lock:
-
-=====================
-Split page table lock
-=====================
-
-Originally, mm->page_table_lock spinlock protected all page tables of the
-mm_struct. But this approach leads to poor page fault scalability of
-multi-threaded applications due high contention on the lock. To improve
-scalability, split page table lock was introduced.
-
-With split page table lock we have separate per-table lock to serialize
-access to the table. At the moment we use split lock for PTE and PMD
-tables. Access to higher level tables protected by mm->page_table_lock.
-
-There are helpers to lock/unlock a table and other accessor functions:
-
- - pte_offset_map_lock()
-	maps pte and takes PTE table lock, returns pointer to the taken
-	lock;
- - pte_unmap_unlock()
-	unlocks and unmaps PTE table;
- - pte_alloc_map_lock()
-	allocates PTE table if needed and take the lock, returns pointer
-	to taken lock or NULL if allocation failed;
- - pte_lockptr()
-	returns pointer to PTE table lock;
- - pmd_lock()
-	takes PMD table lock, returns pointer to taken lock;
- - pmd_lockptr()
-	returns pointer to PMD table lock;
-
-Split page table lock for PTE tables is enabled compile-time if
-CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS.
-If split lock is disabled, all tables are guarded by mm->page_table_lock.
-
-Split page table lock for PMD tables is enabled, if it's enabled for PTE
-tables and the architecture supports it (see below).
-
-Hugetlb and split page table lock
-=================================
-
-Hugetlb can support several page sizes. We use split lock only for PMD
-level, but not for PUD.
-
-Hugetlb-specific helpers:
-
- - huge_pte_lock()
-	takes pmd split lock for PMD_SIZE page, mm->page_table_lock
-	otherwise;
- - huge_pte_lockptr()
-	returns pointer to table lock;
-
-Support of split page table lock by an architecture
-===================================================
-
-There's no need in special enabling of PTE split page table lock: everything
-required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which
-must be called on PTE table allocation / freeing.
-
-Make sure the architecture doesn't use slab allocator for page table
-allocation: slab uses page->slab_cache for its pages.
-This field shares storage with page->ptl.
-
-PMD split lock only makes sense if you have more than two page table
-levels.
-
-PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
-allocation and pgtable_pmd_page_dtor() on freeing.
-
-Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
-pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
-paths: i.e X86_PAE preallocate few PMDs on pgd_alloc().
-
-With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
-
-NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
-be handled properly.
-
-page->ptl
-=========
-
-page->ptl is used to access split page table lock, where 'page' is struct
-page of page containing the table. It shares storage with page->private
-(and few other fields in union).
-
-To avoid increasing size of struct page and have best performance, we use a
-trick:
-
- - if spinlock_t fits into long, we use page->ptr as spinlock, so we
-   can avoid indirect access and save a cache line.
- - if size of spinlock_t is bigger then size of long, we use page->ptl as
-   pointer to spinlock_t and allocate it dynamically. This allows to use
-   split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
-   one more cache line for indirect access;
-
-The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in
-pgtable_pmd_page_ctor() for PMD table.
-
-Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/vm/swap.rst b/Documentation/vm/swap.rst
deleted file mode 100644
index 78819bd4d745..000000000000
--- a/Documentation/vm/swap.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-====
-Swap
-====
diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
deleted file mode 100644
index 216db1d67d04..000000000000
--- a/Documentation/vm/transhuge.rst
+++ /dev/null
@@ -1,187 +0,0 @@
-.. _transhuge:
-
-============================
-Transparent Hugepage Support
-============================
-
-This document describes design principles for Transparent Hugepage (THP)
-support and its interaction with other parts of the memory management
-system.
-
-Design principles
-=================
-
-- "graceful fallback": mm components which don't have transparent hugepage
-  knowledge fall back to breaking huge pmd mapping into table of ptes and,
-  if necessary, split a transparent hugepage. Therefore these components
-  can continue working on the regular pages or regular pte mappings.
-
-- if a hugepage allocation fails because of memory fragmentation,
-  regular pages should be gracefully allocated instead and mixed in
-  the same vma without any failure or significant delay and without
-  userland noticing
-
-- if some task quits and more hugepages become available (either
-  immediately in the buddy or through the VM), guest physical memory
-  backed by regular pages should be relocated on hugepages
-  automatically (with khugepaged)
-
-- it doesn't require memory reservation and in turn it uses hugepages
-  whenever possible (the only possible reservation here is kernelcore=
-  to avoid unmovable pages to fragment all the memory but such a tweak
-  is not specific to transparent hugepage support and it's a generic
-  feature that applies to all dynamic high order allocations in the
-  kernel)
-
-get_user_pages and follow_page
-==============================
-
-get_user_pages and follow_page if run on a hugepage, will return the
-head or tail pages as usual (exactly as they would do on
-hugetlbfs). Most GUP users will only care about the actual physical
-address of the page and its temporary pinning to release after the I/O
-is complete, so they won't ever notice the fact the page is huge. But
-if any driver is going to mangle over the page structure of the tail
-page (like for checking page->mapping or other bits that are relevant
-for the head page and not the tail page), it should be updated to jump
-to check head page instead. Taking a reference on any head/tail page would
-prevent the page from being split by anyone.
-
-.. note::
-   these aren't new constraints to the GUP API, and they match the
-   same constraints that apply to hugetlbfs too, so any driver capable
-   of handling GUP on hugetlbfs will also work fine on transparent
-   hugepage backed mappings.
-
-Graceful fallback
-=================
-
-Code walking pagetables but unaware about huge pmds can simply call
-split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
-pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_pmd where
-missing after pmd_offset returns the pmd. Thanks to the graceful
-fallback design, with a one liner change, you can avoid to write
-hundreds if not thousands of lines of complex code to make your code
-hugepage aware.
-
-If you're not walking pagetables but you run into a physical hugepage
-that you can't handle natively in your code, you can split it by
-calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example. split_huge_page() can fail
-if the page is pinned and you must handle this correctly.
-
-Example to make mremap.c transparent hugepage aware with a one liner
-change::
-
-	diff --git a/mm/mremap.c b/mm/mremap.c
-	--- a/mm/mremap.c
-	+++ b/mm/mremap.c
-	@@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
-			return NULL;
-
-		pmd = pmd_offset(pud, addr);
-	+	split_huge_pmd(vma, pmd, addr);
-		if (pmd_none_or_clear_bad(pmd))
-			return NULL;
-
-Locking in hugepage aware code
-==============================
-
-We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_pmd() has a cost.
-
-To make pagetable walks huge pmd aware, all you need to do is to call
-pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
-mmap_lock in read (or write) mode to be sure a huge pmd cannot be
-created from under you by khugepaged (khugepaged collapse_huge_page
-takes the mmap_lock in write mode in addition to the anon_vma lock). If
-pmd_trans_huge returns false, you just fallback in the old code
-paths. If instead pmd_trans_huge returns true, you have to take the
-page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
-page table lock will prevent the huge pmd being converted into a
-regular pmd from under you (split_huge_pmd can run in parallel to the
-pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page table lock and fallback to the old code as
-before. Otherwise, you can proceed to process the huge pmd and the
-hugepage natively. Once finished, you can drop the page table lock.
-
-Refcounts and transparent huge pages
-====================================
-
-Refcounting on THP is mostly consistent with refcounting on other compound
-pages:
-
-  - get_page()/put_page() and GUP operate on head page's ->_refcount.
-
-  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
-    succeeds on tail pages.
-
-  - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
-    on relevant sub-page of the compound page.
-
-  - map/unmap of the whole compound page is accounted for in compound_mapcount
-    (stored in first tail page). For file huge pages, we also increment
-    ->_mapcount of all sub-pages in order to have race-free detection of
-    last unmap of subpages.
-
-PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
-
-For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all
-subpages is offset up by one. This additional reference is required to
-get race-free detection of unmap of subpages when we have them mapped with
-both PMDs and PTEs.
-
-This optimization is required to lower the overhead of per-subpage mapcount
-tracking. The alternative is to alter ->_mapcount in all subpages on each
-map/unmap of the whole compound page.
-
-For anonymous pages, we set PG_double_map when a PMD of the page is split
-for the first time, but still have a PMD mapping. The additional references
-go away with the last compound_mapcount.
-
-File pages get PG_double_map set on the first map of the page with PTE and
-goes away when the page gets evicted from the page cache.
-
-split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the page
-structures. It can be done easily for refcounts taken by page table
-entries, but we don't have enough information on how to distribute any
-additional pins (i.e. from get_user_pages). split_huge_page() fails any
-requests to split pinned huge pages: it expects page count to be equal to
-the sum of mapcount of all sub-pages plus one (split_huge_page caller must
-have a reference to the head page).
-
-split_huge_page uses migration entries to stabilize page->_refcount and
-page->_mapcount of anonymous pages. File pages just get unmapped.
-
-We are safe against physical memory scanners too: the only legitimate way
-a scanner can get a reference to a page is get_page_unless_zero().
-
-All tail pages have zero ->_refcount until atomic_add(). This prevents the
-scanner from getting a reference to the tail page up to that point. After the
-atomic_add() we don't care about the ->_refcount value. We already know how
-many references should be uncharged from the head page.
-
-For head page get_page_unless_zero() will succeed and we don't mind. It's
-clear where references should go after split: it will stay on the head page.
-
-Note that split_huge_pmd() doesn't have any limitations on refcounting:
-pmd can be split at any point and never fails.
-
-Partial unmap and deferred_split_huge_page()
-============================================
-
-Unmapping part of THP (with munmap() or other way) is not going to free
-memory immediately. Instead, we detect that a subpage of THP is not in use
-in page_remove_rmap() and queue the THP for splitting if memory pressure
-comes. Splitting will free up unused subpages.
-
-Splitting the page right away is not an option due to locking context in
-the place where we can detect partial unmap. It also might be
-counterproductive since in many cases partial unmap happens during exit(2) if
-a THP crosses a VMA boundary.
-
-The function deferred_split_huge_page() is used to queue a page for splitting.
-The splitting itself will happen when we get memory pressure via shrinker
-interface.
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
deleted file mode 100644
index b280367d6a44..000000000000
--- a/Documentation/vm/unevictable-lru.rst
+++ /dev/null
@@ -1,554 +0,0 @@
-.. _unevictable_lru:
-
-==============================
-Unevictable LRU Infrastructure
-==============================
-
-.. contents:: :local:
-
-
-Introduction
-============
-
-This document describes the Linux memory manager's "Unevictable LRU"
-infrastructure and the use of this to manage several types of "unevictable"
-pages.
-
-The document attempts to provide the overall rationale behind this mechanism
-and the rationale for some of the design decisions that drove the
-implementation.  The latter design rationale is discussed in the context of an
-implementation description.  Admittedly, one can obtain the implementation
-details - the "what does it do?" - by reading the code.  One hopes that the
-descriptions below add value by provide the answer to "why does it do that?".
-
-
-
-The Unevictable LRU
-===================
-
-The Unevictable LRU facility adds an additional LRU list to track unevictable
-pages and to hide these pages from vmscan.  This mechanism is based on a patch
-by Larry Woodman of Red Hat to address several scalability problems with page
-reclaim in Linux.  The problems have been observed at customer sites on large
-memory x86_64 systems.
-
-To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
-main memory will have over 32 million 4k pages in a single node.  When a large
-fraction of these pages are not evictable for any reason [see below], vmscan
-will spend a lot of time scanning the LRU lists looking for the small fraction
-of pages that are evictable.  This can result in a situation where all CPUs are
-spending 100% of their time in vmscan for hours or days on end, with the system
-completely unresponsive.
-
-The unevictable list addresses the following classes of unevictable pages:
-
- * Those owned by ramfs.
-
- * Those mapped into SHM_LOCK'd shared memory regions.
-
- * Those mapped into VM_LOCKED [mlock()ed] VMAs.
-
-The infrastructure may also be able to handle other conditions that make pages
-unevictable, either by definition or by circumstance, in the future.
-
-
-The Unevictable LRU Page List
------------------------------
-
-The Unevictable LRU page list is a lie.  It was never an LRU-ordered list, but a
-companion to the LRU-ordered anonymous and file, active and inactive page lists;
-and now it is not even a page list.  But following familiar convention, here in
-this document and in the source, we often imagine it as a fifth LRU page list.
-
-The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
-called the "unevictable" list and an associated page flag, PG_unevictable, to
-indicate that the page is being managed on the unevictable list.
-
-The PG_unevictable flag is analogous to, and mutually exclusive with, the
-PG_active flag in that it indicates on which LRU list a page resides when
-PG_lru is set.
-
-The Unevictable LRU infrastructure maintains unevictable pages as if they were
-on an additional LRU list for a few reasons:
-
- (1) We get to "treat unevictable pages just like we treat other pages in the
-     system - which means we get to use the same code to manipulate them, the
-     same code to isolate them (for migrate, etc.), the same code to keep track
-     of the statistics, etc..." [Rik van Riel]
-
- (2) We want to be able to migrate unevictable pages between nodes for memory
-     defragmentation, workload management and memory hotplug.  The Linux kernel
-     can only migrate pages that it can successfully isolate from the LRU
-     lists (or "Movable" pages: outside of consideration here).  If we were to
-     maintain pages elsewhere than on an LRU-like list, where they can be
-     detected by isolate_lru_page(), we would prevent their migration.
-
-The unevictable list does not differentiate between file-backed and anonymous,
-swap-backed pages.  This differentiation is only important while the pages are,
-in fact, evictable.
-
-The unevictable list benefits from the "arrayification" of the per-node LRU
-lists and statistics originally proposed and posted by Christoph Lameter.
-
-
-Memory Control Group Interaction
---------------------------------
-
-The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
-extending the lru_list enum.
-
-The memory controller data structure automatically gets a per-node unevictable
-list as a result of the "arrayification" of the per-node LRU lists (one per
-lru_list enum element).  The memory controller tracks the movement of pages to
-and from the unevictable list.
-
-When a memory control group comes under memory pressure, the controller will
-not attempt to reclaim pages on the unevictable list.  This has a couple of
-effects:
-
- (1) Because the pages are "hidden" from reclaim on the unevictable list, the
-     reclaim process can be more efficient, dealing only with pages that have a
-     chance of being reclaimed.
-
- (2) On the other hand, if too many of the pages charged to the control group
-     are unevictable, the evictable portion of the working set of the tasks in
-     the control group may not fit into the available memory.  This can cause
-     the control group to thrash or to OOM-kill tasks.
-
-
-.. _mark_addr_space_unevict:
-
-Marking Address Spaces Unevictable
-----------------------------------
-
-For facilities such as ramfs none of the pages attached to the address space
-may be evicted.  To prevent eviction of any such pages, the AS_UNEVICTABLE
-address space flag is provided, and this can be manipulated by a filesystem
-using a number of wrapper functions:
-
- * ``void mapping_set_unevictable(struct address_space *mapping);``
-
-	Mark the address space as being completely unevictable.
-
- * ``void mapping_clear_unevictable(struct address_space *mapping);``
-
-	Mark the address space as being evictable.
-
- * ``int mapping_unevictable(struct address_space *mapping);``
-
-	Query the address space, and return true if it is completely
-	unevictable.
-
-These are currently used in three places in the kernel:
-
- (1) By ramfs to mark the address spaces of its inodes when they are created,
-     and this mark remains for the life of the inode.
-
- (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
-     Note that SHM_LOCK is not required to page in the locked pages if they're
-     swapped out; the application must touch the pages manually if it wants to
-     ensure they're in memory.
-
- (3) By the i915 driver to mark pinned address space until it's unpinned. The
-     amount of unevictable memory marked by i915 driver is roughly the bounded
-     object size in debugfs/dri/0/i915_gem_objects.
-
-
-Detecting Unevictable Pages
----------------------------
-
-The function page_evictable() in mm/internal.h determines whether a page is
-evictable or not using the query function outlined above [see section
-:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
-to check the AS_UNEVICTABLE flag.
-
-For address spaces that are so marked after being populated (as SHM regions
-might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
-the page tables for the region as does, for example, mlock(), nor need it make
-any special effort to push any pages in the SHM_LOCK'd area to the unevictable
-list.  Instead, vmscan will do this if and when it encounters the pages during
-a reclamation scan.
-
-On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan
-the pages in the region and "rescue" them from the unevictable list if no other
-condition is keeping them unevictable.  If an unevictable region is destroyed,
-the pages are also "rescued" from the unevictable list in the process of
-freeing them.
-
-page_evictable() also checks for mlocked pages by testing an additional page
-flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
-faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
-
-
-Vmscan's Handling of Unevictable Pages
---------------------------------------
-
-If unevictable pages are culled in the fault path, or moved to the unevictable
-list at mlock() or mmap() time, vmscan will not encounter the pages until they
-have become evictable again (via munlock() for example) and have been "rescued"
-from the unevictable list.  However, there may be situations where we decide,
-for the sake of expediency, to leave an unevictable page on one of the regular
-active/inactive LRU lists for vmscan to deal with.  vmscan checks for such
-pages in all of the shrink_{active|inactive|page}_list() functions and will
-"cull" such pages that it encounters: that is, it diverts those pages to the
-unevictable list for the memory cgroup and node being scanned.
-
-There may be situations where a page is mapped into a VM_LOCKED VMA, but the
-page is not marked as PG_mlocked.  Such pages will make it all the way to
-shrink_active_list() or shrink_page_list() where they will be detected when
-vmscan walks the reverse map in page_referenced() or try_to_unmap().  The page
-is culled to the unevictable list when it is released by the shrinker.
-
-To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
-using putback_lru_page() - the inverse operation to isolate_lru_page() - after
-dropping the page lock.  Because the condition which makes the page unevictable
-may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the
-unevictable state of a page before placing it on the unevictable list.
-
-
-MLOCKED Pages
-=============
-
-The unevictable page list is also useful for mlock(), in addition to ramfs and
-SYSV SHM.  Note that mlock() is only available in CONFIG_MMU=y situations; in
-NOMMU situations, all mappings are effectively mlocked.
-
-
-History
--------
-
-The "Unevictable mlocked Pages" infrastructure is based on work originally
-posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
-Nick posted his patch as an alternative to a patch posted by Christoph Lameter
-to achieve the same objective: hiding mlocked pages from vmscan.
-
-In Nick's patch, he used one of the struct page LRU list link fields as a count
-of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years
-earlier).  But this use of the link field for a count prevented the management
-of the pages on an LRU list, and thus mlocked pages were not migratable as
-isolate_lru_page() could not detect them, and the LRU list link field was not
-available to the migration subsystem.
-
-Nick resolved this by putting mlocked pages back on the LRU list before
-attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs.  When
-Nick's patch was integrated with the Unevictable LRU work, the count was
-replaced by walking the reverse map when munlocking, to determine whether any
-other VM_LOCKED VMAs still mapped the page.
-
-However, walking the reverse map for each page when munlocking was ugly and
-inefficient, and could lead to catastrophic contention on a file's rmap lock,
-when many processes which had it mlocked were trying to exit.  In 5.18, the
-idea of keeping mlock_count in Unevictable LRU list link field was revived and
-put to work, without preventing the migration of mlocked pages.  This is why
-the "Unevictable LRU list" cannot be a linked list of pages now; but there was
-no use for that linked list anyway - though its size is maintained for meminfo.
-
-
-Basic Management
-----------------
-
-mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
-pages.  When such a page has been "noticed" by the memory management subsystem,
-the page is marked with the PG_mlocked flag.  This can be manipulated using the
-PageMlocked() functions.
-
-A PG_mlocked page will be placed on the unevictable list when it is added to
-the LRU.  Such pages can be "noticed" by memory management in several places:
-
- (1) in the mlock()/mlock2()/mlockall() system call handlers;
-
- (2) in the mmap() system call handler when mmapping a region with the
-     MAP_LOCKED flag;
-
- (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
-     flag;
-
- (4) in the fault path and when a VM_LOCKED stack segment is expanded; or
-
- (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
-     reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
-
-mlocked pages become unlocked and rescued from the unevictable list when:
-
- (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
-
- (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
-     unmapping at task exit;
-
- (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
-     or
-
- (4) before a page is COW'd in a VM_LOCKED VMA.
-
-
-mlock()/mlock2()/mlockall() System Call Handling
-------------------------------------------------
-
-mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup()
-for each VMA in the range specified by the call.  In the case of mlockall(),
-this is the entire active address space of the task.  Note that mlock_fixup()
-is used for both mlocking and munlocking a range of memory.  A call to mlock()
-an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is
-treated as a no-op and mlock_fixup() simply returns.
-
-If the VMA passes some filtering as described in "Filtering Special VMAs"
-below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
-off a subset of the VMA if the range does not cover the entire VMA.  Any pages
-already present in the VMA are then marked as mlocked by mlock_page() via
-mlock_pte_range() via walk_page_range() via mlock_vma_pages_range().
-
-Before returning from the system call, do_mlock() or mlockall() will call
-__mm_populate() to fault in the remaining pages via get_user_pages() and to
-mark those pages as mlocked as they are faulted.
-
-Note that the VMA being mlocked might be mapped with PROT_NONE.  In this case,
-get_user_pages() will be unable to fault in the pages.  That's okay.  If pages
-do end up getting faulted into this VM_LOCKED VMA, they will be handled in the
-fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled.
-
-For each PTE (or PMD) being faulted into a VMA, the page add rmap function
-calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED
-(unless it is a PTE mapping of a part of a transparent huge page).  Or when
-it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable()
-calls mlock_new_page() instead: similar to mlock_page(), but can make better
-judgments, since this page is held exclusively and known not to be on LRU yet.
-
-mlock_page() sets PageMlocked immediately, then places the page on the CPU's
-mlock pagevec, to batch up the rest of the work to be done under lru_lock by
-__mlock_page().  __mlock_page() sets PageUnevictable, initializes mlock_count
-and moves the page to unevictable state ("the unevictable LRU", but with
-mlock_count in place of LRU threading).  Or if the page was already PageLRU
-and PageUnevictable and PageMlocked, it simply increments the mlock_count.
-
-But in practice that may not work ideally: the page may not yet be on an LRU, or
-it may have been temporarily isolated from LRU.  In such cases the mlock_count
-field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn()
-returns the page to "LRU".  Races prohibit mlock_count from being set to 1 then:
-rather than risk stranding a page indefinitely as unevictable, always err with
-mlock_count on the low side, so that when munlocked the page will be rescued to
-an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a
-VM_LOCKED VMA.
-
-
-Filtering Special VMAs
-----------------------
-
-mlock_fixup() filters several classes of "special" VMAs:
-
-1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely.  The pages behind
-   these mappings are inherently pinned, so we don't need to mark them as
-   mlocked.  In any case, most of the pages have no struct page in which to so
-   mark the page.  Because of this, get_user_pages() will fail for these VMAs,
-   so there is no sense in attempting to visit them.
-
-2) VMAs mapping hugetlbfs page are already effectively pinned into memory.  We
-   neither need nor want to mlock() these pages.  But __mm_populate() includes
-   hugetlbfs ranges, allocating the huge pages and populating the PTEs.
-
-3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
-   such as the VDSO page, relay channel pages, etc.  These pages are inherently
-   unevictable and are not managed on the LRU lists.  __mm_populate() includes
-   these ranges, populating the PTEs if not already populated.
-
-4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate()
-   includes these ranges, populating the PTEs if not already populated.
-
-Note that for all of these special VMAs, mlock_fixup() does not set the
-VM_LOCKED flag.  Therefore, we won't have to deal with them later during
-munlock(), munmap() or task exit.  Neither does mlock_fixup() account these
-VMAs against the task's "locked_vm".
-
-
-munlock()/munlockall() System Call Handling
--------------------------------------------
-
-The munlock() and munlockall() system calls are handled by the same
-mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are.
-If called to munlock an already munlocked VMA, mlock_fixup() simply returns.
-Because of the VMA filtering discussed above, VM_LOCKED will not be set in
-any "special" VMAs.  So, those VMAs will be ignored for munlock.
-
-If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
-specified range.  All pages in the VMA are then munlocked by munlock_page() via
-mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same
-function used when mlocking a VMA range, with new flags for the VMA indicating
-that it is munlock() being performed.
-
-munlock_page() uses the mlock pagevec to batch up work to be done under
-lru_lock by  __munlock_page().  __munlock_page() decrements the page's
-mlock_count, and when that reaches 0 it clears PageMlocked and clears
-PageUnevictable, moving the page from unevictable state to inactive LRU.
-
-But in practice that may not work ideally: the page may not yet have reached
-"the unevictable LRU", or it may have been temporarily isolated from it.  In
-those cases its mlock_count field is unusable and must be assumed to be 0: so
-that the page will be rescued to an evictable LRU, then perhaps be mlocked
-again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Migrating MLOCKED Pages
------------------------
-
-A page that is being migrated has been isolated from the LRU lists and is held
-locked across unmapping of the page, updating the page's address space entry
-and copying the contents and state, until the page table entry has been
-replaced with an entry that refers to the new page.  Linux supports migration
-of mlocked pages and other unevictable pages.  PG_mlocked is cleared from the
-the old page when it is unmapped from the last VM_LOCKED VMA, and set when the
-new page is mapped in place of migration entry in a VM_LOCKED VMA.  If the page
-was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the
-page was unevictable for other reasons, PG_unevictable is copied explicitly.
-
-Note that page migration can race with mlocking or munlocking of the same page.
-There is mostly no problem since page migration requires unmapping all PTEs of
-the old page (including munlock where VM_LOCKED), then mapping in the new page
-(including mlock where VM_LOCKED).  The page table locks provide sufficient
-synchronization.
-
-However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA,
-before mlocking any pages already present, if one of those pages were migrated
-before mlock_pte_range() reached it, it would get counted twice in mlock_count.
-To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO,
-so that mlock_vma_page() will skip it.
-
-To complete page migration, we place the old and new pages back onto the LRU
-afterwards.  The "unneeded" page - old page on success, new page on failure -
-is freed when the reference count held by the migration process is released.
-
-
-Compacting MLOCKED Pages
-------------------------
-
-The memory map can be scanned for compactable regions and the default behavior
-is to let unevictable pages be moved.  /proc/sys/vm/compact_unevictable_allowed
-controls this behavior (see Documentation/admin-guide/sysctl/vm.rst).  The work
-of compaction is mostly handled by the page migration code and the same work
-flow as described in Migrating MLOCKED Pages will apply.
-
-
-MLOCKING Transparent Huge Pages
--------------------------------
-
-A transparent huge page is represented by a single entry on an LRU list.
-Therefore, we can only make unevictable an entire compound page, not
-individual subpages.
-
-If a user tries to mlock() part of a huge page, and no user mlock()s the
-whole of the huge page, we want the rest of the page to be reclaimable.
-
-We cannot just split the page on partial mlock() as split_huge_page() can
-fail and a new intermittent failure mode for the syscall is undesirable.
-
-We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
-the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
-
-This way the huge page is accessible for vmscan.  Under memory pressure the
-page will be split, subpages which belong to VM_LOCKED VMAs will be moved
-to the unevictable LRU and the rest can be reclaimed.
-
-/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
-of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs.
-
-
-mmap(MAP_LOCKED) System Call Handling
--------------------------------------
-
-In addition to the mlock(), mlock2() and mlockall() system calls, an application
-can request that a region of memory be mlocked by supplying the MAP_LOCKED flag
-to the mmap() call.  There is one important and subtle difference here, though.
-mmap() + mlock() will fail if the range cannot be faulted in (e.g. because
-mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail.
-The mmaped area will still have properties of the locked area - pages will not
-get swapped out - but major page faults to fault memory in might still happen.
-
-Furthermore, any mmap() call or brk() call that expands the heap by a task
-that has previously called mlockall() with the MCL_FUTURE flag will result
-in the newly mapped memory being mlocked.  Before the unevictable/mlock
-changes, the kernel simply called make_pages_present() to allocate pages
-and populate the page table.
-
-To mlock a range of memory under the unevictable/mlock infrastructure,
-the mmap() handler and task address space expansion functions call
-populate_vma_page_range() specifying the vma and the address range to mlock.
-
-
-munmap()/exit()/exec() System Call Handling
--------------------------------------------
-
-When unmapping an mlocked region of memory, whether by an explicit call to
-munmap() or via an internal unmap from exit() or exec() processing, we must
-munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
-Before the unevictable/mlock changes, mlocking did not mark the pages in any
-way, so unmapping them required no processing.
-
-For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
-munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
-(unless it was a PTE mapping of a part of a transparent huge page).
-
-munlock_page() uses the mlock pagevec to batch up work to be done under
-lru_lock by  __munlock_page().  __munlock_page() decrements the page's
-mlock_count, and when that reaches 0 it clears PageMlocked and clears
-PageUnevictable, moving the page from unevictable state to inactive LRU.
-
-But in practice that may not work ideally: the page may not yet have reached
-"the unevictable LRU", or it may have been temporarily isolated from it.  In
-those cases its mlock_count field is unusable and must be assumed to be 0: so
-that the page will be rescued to an evictable LRU, then perhaps be mlocked
-again later if vmscan finds it in a VM_LOCKED VMA.
-
-
-Truncating MLOCKED Pages
-------------------------
-
-File truncation or hole punching forcibly unmaps the deleted pages from
-userspace; truncation even unmaps and deletes any private anonymous pages
-which had been Copied-On-Write from the file pages now being truncated.
-
-Mlocked pages can be munlocked and deleted in this way: like with munmap(),
-for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
-munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
-(unless it was a PTE mapping of a part of a transparent huge page).
-
-However, if there is a racing munlock(), since mlock_vma_pages_range() starts
-munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages
-present, if one of those pages were unmapped by truncation or hole punch before
-mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA,
-and would not be counted out of mlock_count.  In this rare case, a page may
-still appear as PageMlocked after it has been fully unmapped: and it is left to
-release_pages() (or __page_cache_release()) to clear it and update statistics
-before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared,
-which is usually 0).
-
-
-Page Reclaim in shrink_*_list()
--------------------------------
-
-vmscan's shrink_active_list() culls any obviously unevictable pages -
-i.e. !page_evictable(page) pages - diverting those to the unevictable list.
-However, shrink_active_list() only sees unevictable pages that made it onto the
-active/inactive LRU lists.  Note that these pages do not have PageUnevictable
-set - otherwise they would be on the unevictable list and shrink_active_list()
-would never see them.
-
-Some examples of these unevictable pages on the LRU lists are:
-
- (1) ramfs pages that have been placed on the LRU lists when first allocated.
-
- (2) SHM_LOCK'd shared memory pages.  shmctl(SHM_LOCK) does not attempt to
-     allocate or fault in the pages in the shared memory region.  This happens
-     when an application accesses the page the first time after SHM_LOCK'ing
-     the segment.
-
- (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked,
-     but events left mlock_count too low, so they were munlocked too early.
-
-vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously
-unevictable pages found on the inactive lists to the appropriate memory cgroup
-and node unevictable list.
-
-rmap's page_referenced_one(), called via vmscan's shrink_active_list() or
-shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(),
-check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page()
-to correct them.  Such pages are culled to the unevictable list when released
-by the shrinker.
diff --git a/Documentation/vm/vmalloc.rst b/Documentation/vm/vmalloc.rst
deleted file mode 100644
index 363fe20d6b9f..000000000000
--- a/Documentation/vm/vmalloc.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-======================================
-Virtually Contiguous Memory Allocation
-======================================
diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst
deleted file mode 100644
index fc8c67833af6..000000000000
--- a/Documentation/vm/vmalloced-kernel-stacks.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=====================================
-Virtually Mapped Kernel Stack Support
-=====================================
-
-:Author: Shuah Khan <skhan@linuxfoundation.org>
-
-.. contents:: :local:
-
-Overview
---------
-
-This is a compilation of information from the code and original patch
-series that introduced the `Virtually Mapped Kernel Stacks feature
-<https://lwn.net/Articles/694348/>`
-
-Introduction
-------------
-
-Kernel stack overflows are often hard to debug and make the kernel
-susceptible to exploits. Problems could show up at a later time making
-it difficult to isolate and root-cause.
-
-Virtually-mapped kernel stacks with guard pages causes kernel stack
-overflows to be caught immediately rather than causing difficult to
-diagnose corruptions.
-
-HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
-support for virtually mapped stacks with guard pages. This feature
-causes reliable faults when the stack overflows. The usability of
-the stack trace after overflow and response to the overflow itself
-is architecture dependent.
-
-.. note::
-        As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
-        support for VMAP_STACK.
-
-HAVE_ARCH_VMAP_STACK
---------------------
-
-Architectures that can support Virtually Mapped Kernel Stacks should
-enable this bool configuration option. The requirements are:
-
-- vmalloc space must be large enough to hold many kernel stacks. This
-  may rule out many 32-bit architectures.
-- Stacks in vmalloc space need to work reliably.  For example, if
-  vmap page tables are created on demand, either this mechanism
-  needs to work while the stack points to a virtual address with
-  unpopulated page tables or arch code (switch_to() and switch_mm(),
-  most likely) needs to ensure that the stack's page table entries
-  are populated before running on a possibly unpopulated stack.
-- If the stack overflows into a guard page, something reasonable
-  should happen. The definition of "reasonable" is flexible, but
-  instantly rebooting without logging anything would be unfriendly.
-
-VMAP_STACK
-----------
-
-VMAP_STACK bool configuration option when enabled allocates virtually
-mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
-
-- Enable this if you want the use virtually-mapped kernel stacks
-  with guard pages. This causes kernel stack overflows to be caught
-  immediately rather than causing difficult-to-diagnose corruption.
-
-.. note::
-
-        Using this feature with KASAN requires architecture support
-        for backing virtual mappings with real shadow memory, and
-        KASAN_VMALLOC must be enabled.
-
-.. note::
-
-        VMAP_STACK is enabled, it is not possible to run DMA on stack
-        allocated data.
-
-Kernel configuration options and dependencies keep changing. Refer to
-the latest code base:
-
-`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`
-
-Allocation
------------
-
-When a new kernel thread is created, thread stack is allocated from
-virtually contiguous memory pages from the page level allocator. These
-pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
-protections.
-
-alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
-with PAGE_KERNEL protections.
-
-- Allocated stacks are cached and later reused by new threads, so memcg
-  accounting is performed manually on assigning/releasing stacks to tasks.
-  Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
-- vm_struct is cached to be able to find when thread free is initiated
-  in interrupt context. free_thread_stack() can be called in interrupt
-  context.
-- On arm64, all VMAP's stacks need to have the same alignment to ensure
-  that VMAP'd stack overflow detection works correctly. Arch specific
-  vmap stack allocator takes care of this detail.
-- This does not address interrupt stacks - according to the original patch
-
-Thread stack allocation is initiated from clone(), fork(), vfork(),
-kernel_thread() via kernel_clone(). Leaving a few hints for searching
-the code base to understand when and how thread stack is allocated.
-
-Bulk of the code is in:
-`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`.
-
-stack_vm_area pointer in task_struct keeps track of the virtually allocated
-stack and a non-null stack_vm_area pointer serves as a indication that the
-virtually mapped kernel stacks are enabled.
-
-::
-
-        struct vm_struct *stack_vm_area;
-
-Stack overflow handling
------------------------
-
-Leading and trailing guard pages help detect stack overflows. When stack
-overflows into the guard pages, handlers have to be careful not overflow
-the stack again. When handlers are called, it is likely that very little
-stack space is left.
-
-On x86, this is done by handling the page fault indicating the kernel
-stack overflow on the double-fault stack.
-
-Testing VMAP allocation with guard pages
-----------------------------------------
-
-How do we ensure that VMAP_STACK is actually allocating with a leading
-and trailing guard page? The following lkdtm tests can help detect any
-regressions.
-
-::
-
-        void lkdtm_STACK_GUARD_PAGE_LEADING()
-        void lkdtm_STACK_GUARD_PAGE_TRAILING()
-
-Conclusions
------------
-
-- A percpu cache of vmalloced stacks appears to be a bit faster than a
-  high-order stack allocation, at least when the cache hits.
-- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
-  simply embed the thread_info (containing only flags) and 'int cpu' into
-  task_struct.
-- The thread stack can be free'ed as soon as the task is dead (without
-  waiting for RCU) and then, if vmapped stacks are in use, cache the
-  entire stack for reuse on the same cpu.
diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/vm/vmemmap_dedup.rst
deleted file mode 100644
index c9c495f62d12..000000000000
--- a/Documentation/vm/vmemmap_dedup.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=========================================
-A vmemmap diet for HugeTLB and Device DAX
-=========================================
-
-HugeTLB
-=======
-
-The struct page structures (page structs) are used to describe a physical
-page frame. By default, there is a one-to-one mapping from a page frame to
-it's corresponding page struct.
-
-HugeTLB pages consist of multiple base page size pages and is supported by many
-architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
-details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
-currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
-consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
-For each base page, there is a corresponding page struct.
-
-Within the HugeTLB subsystem, only the first 4 page structs are used to
-contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
-this upper limit. The only 'useful' information in the remaining page structs
-is the compound_head field, and this field is the same for all tail pages.
-
-By removing redundant page structs for HugeTLB pages, memory can be returned
-to the buddy allocator for other uses.
-
-Different architectures support different HugeTLB pages. For example, the
-following table is the HugeTLB page size supported by x86 and arm64
-architectures. Because arm64 supports 4k, 16k, and 64k base pages and
-supports contiguous entries, so it supports many kinds of sizes of HugeTLB
-page.
-
-+--------------+-----------+-----------------------------------------------+
-| Architecture | Page Size |                HugeTLB Page Size              |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-|    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-|              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
-|              +-----------+-----------+-----------+-----------+-----------+
-|    arm64     |   16KB    |    2MB    |   32MB    |     1GB   |           |
-|              +-----------+-----------+-----------+-----------+-----------+
-|              |   64KB    |    2MB    |  512MB    |    16GB   |           |
-+--------------+-----------+-----------+-----------+-----------+-----------+
-
-When the system boot up, every HugeTLB page has more than one struct page
-structs which size is (unit: pages)::
-
-   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
-
-Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
-of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
-relationship::
-
-   HugeTLB_Size = n * PAGE_SIZE
-
-Then::
-
-   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
-               = n * sizeof(struct page) / PAGE_SIZE
-
-We can use huge mapping at the pud/pmd level for the HugeTLB page.
-
-For the HugeTLB page of the pmd level mapping, then::
-
-   struct_size = n * sizeof(struct page) / PAGE_SIZE
-               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
-               = sizeof(struct page) / sizeof(pte_t)
-               = 64 / 8
-               = 8 (pages)
-
-Where n is how many pte entries which one page can contains. So the value of
-n is (PAGE_SIZE / sizeof(pte_t)).
-
-This optimization only supports 64-bit system, so the value of sizeof(pte_t)
-is 8. And this optimization also applicable only when the size of struct page
-is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
-x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
-size of struct page structs of it is 8 page frames which size depends on the
-size of the base page.
-
-For the HugeTLB page of the pud level mapping, then::
-
-   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
-               = PAGE_SIZE / 8 * 8 (pages)
-               = PAGE_SIZE (pages)
-
-Where the struct_size(pmd) is the size of the struct page structs of a
-HugeTLB page of the pmd level mapping.
-
-E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
-HugeTLB page consists in 4096.
-
-Next, we take the pmd level mapping of the HugeTLB page as an example to
-show the internal implementation of this optimization. There are 8 pages
-struct page structs associated with a HugeTLB page which is pmd mapped.
-
-Here is how things look before optimization::
-
-    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | -------------> |     1     |
- |           |                     +-----------+                +-----------+
- |           |                     |     2     | -------------> |     2     |
- |           |                     +-----------+                +-----------+
- |           |                     |     3     | -------------> |     3     |
- |           |                     +-----------+                +-----------+
- |           |                     |     4     | -------------> |     4     |
- |    PMD    |                     +-----------+                +-----------+
- |   level   |                     |     5     | -------------> |     5     |
- |  mapping  |                     +-----------+                +-----------+
- |           |                     |     6     | -------------> |     6     |
- |           |                     +-----------+                +-----------+
- |           |                     |     7     | -------------> |     7     |
- |           |                     +-----------+                +-----------+
- |           |
- |           |
- |           |
- +-----------+
-
-The value of page->compound_head is the same for all tail pages. The first
-page of page structs (page 0) associated with the HugeTLB page contains the 4
-page structs necessary to describe the HugeTLB. The only use of the remaining
-pages of page structs (page 1 to page 7) is to point to page->compound_head.
-Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
-will be used for each HugeTLB page. This will allow us to free the remaining
-7 pages to the buddy allocator.
-
-Here is how things look after remapping::
-
-    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
- |           |                     +-----------+                  | | | | | |
- |           |                     |     2     | -----------------+ | | | | |
- |           |                     +-----------+                    | | | | |
- |           |                     |     3     | -------------------+ | | | |
- |           |                     +-----------+                      | | | |
- |           |                     |     4     | ---------------------+ | | |
- |    PMD    |                     +-----------+                        | | |
- |   level   |                     |     5     | -----------------------+ | |
- |  mapping  |                     +-----------+                          | |
- |           |                     |     6     | -------------------------+ |
- |           |                     +-----------+                            |
- |           |                     |     7     | ---------------------------+
- |           |                     +-----------+
- |           |
- |           |
- |           |
- +-----------+
-
-When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
-vmemmap pages and restore the previous mapping relationship.
-
-For the HugeTLB page of the pud level mapping. It is similar to the former.
-We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
-
-Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
-(e.g. aarch64) provides a contiguous bit in the translation table entries
-that hints to the MMU to indicate that it is one of a contiguous set of
-entries that can be cached in a single TLB entry.
-
-The contiguous bit is used to increase the mapping size at the pmd and pte
-(last) level. So this type of HugeTLB page can be optimized only when its
-size of the struct page structs is greater than 1 page.
-
-Notice: The head vmemmap page is not freed to the buddy allocator and all
-tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
-more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
-associated with each HugeTLB page. The compound_head() can handle this
-correctly (more details refer to the comment above compound_head()).
-
-Device DAX
-==========
-
-The device-dax interface uses the same tail deduplication technique explained
-in the previous chapter, except when used with the vmemmap in
-the device (altmap).
-
-The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
-PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
-
-The differences with HugeTLB are relatively minor.
-
-It only use 3 page structs for storing all information as opposed
-to 4 on HugeTLB pages.
-
-There's no remapping of vmemmap given that device-dax memory is not part of
-System RAM ranges initialized at boot. Thus the tail page deduplication
-happens at a later stage when we populate the sections. HugeTLB reuses the
-the head vmemmap page representing, whereas device-dax reuses the tail
-vmemmap page. This results in only half of the savings compared to HugeTLB.
-
-Deduplicated tail pages are not mapped read-only.
-
-Here's how things look like on device-dax after the sections are populated::
-
- +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
- |           |                     |     0     | -------------> |     0     |
- |           |                     +-----------+                +-----------+
- |           |                     |     1     | -------------> |     1     |
- |           |                     +-----------+                +-----------+
- |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
- |           |                     +-----------+                   | | | | |
- |           |                     |     3     | ------------------+ | | | |
- |           |                     +-----------+                     | | | |
- |           |                     |     4     | --------------------+ | | |
- |    PMD    |                     +-----------+                       | | |
- |   level   |                     |     5     | ----------------------+ | |
- |  mapping  |                     +-----------+                         | |
- |           |                     |     6     | ------------------------+ |
- |           |                     +-----------+                           |
- |           |                     |     7     | --------------------------+
- |           |                     +-----------+
- |           |
- |           |
- |           |
- +-----------+
diff --git a/Documentation/vm/z3fold.rst b/Documentation/vm/z3fold.rst
deleted file mode 100644
index 224e3c61d686..000000000000
--- a/Documentation/vm/z3fold.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _z3fold:
-
-======
-z3fold
-======
-
-z3fold is a special purpose allocator for storing compressed pages.
-It is designed to store up to three compressed pages per physical page.
-It is a zbud derivative which allows for higher compression
-ratio keeping the simplicity and determinism of its predecessor.
-
-The main differences between z3fold and zbud are:
-
-* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
-* z3fold can hold up to 3 compressed pages in its page
-* z3fold doesn't export any API itself and is thus intended to be used
-  via the zpool API.
-
-To keep the determinism and simplicity, z3fold, just like zbud, always
-stores an integral number of compressed pages per page, but it can store
-up to 3 pages unlike zbud which can store at most 2. Therefore the
-compression ratio goes to around 2.7x while zbud's one is around 1.7x.
-
-Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
-return a dereferenceable pointer. Instead, it returns an unsigned long
-handle which encodes actual location of the allocated object.
-
-Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
-depend on MMU enabled and provides more predictable reclaim behavior
-which makes it a better fit for small and response-critical systems.
diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/vm/zsmalloc.rst
deleted file mode 100644
index 6e79893d6132..000000000000
--- a/Documentation/vm/zsmalloc.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-.. _zsmalloc:
-
-========
-zsmalloc
-========
-
-This allocator is designed for use with zram. Thus, the allocator is
-supposed to work well under low memory conditions. In particular, it
-never attempts higher order page allocation which is very likely to
-fail under memory pressure. On the other hand, if we just use single
-(0-order) pages, it would suffer from very high fragmentation --
-any object of size PAGE_SIZE/2 or larger would occupy an entire page.
-This was one of the major issues with its predecessor (xvmalloc).
-
-To overcome these issues, zsmalloc allocates a bunch of 0-order pages
-and links them together using various 'struct page' fields. These linked
-pages act as a single higher-order page i.e. an object can span 0-order
-page boundaries. The code refers to these linked pages as a single entity
-called zspage.
-
-For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
-since this satisfies the requirements of all its current users (in the
-worst case, page is incompressible and is thus stored "as-is" i.e. in
-uncompressed form). For allocation requests larger than this size, failure
-is returned (see zs_malloc).
-
-Additionally, zs_malloc() does not return a dereferenceable pointer.
-Instead, it returns an opaque handle (unsigned long) which encodes actual
-location of the allocated object. The reason for this indirection is that
-zsmalloc does not keep zspages permanently mapped since that would cause
-issues on 32-bit systems where the VA region for kernel space mappings
-is very small. So, before using the allocating memory, the object has to
-be mapped using zs_map_object() to get a usable pointer and subsequently
-unmapped using zs_unmap_object().
-
-stat
-====
-
-With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
-``/sys/kernel/debug/zsmalloc/<user name>``. Here is a sample of stat output::
-
- # cat /sys/kernel/debug/zsmalloc/zram0/classes
-
- class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage
-    ...
-    ...
-     9   176           0            1           186        129          8                4
-    10   192           1            0          2880       2872        135                3
-    11   208           0            1           819        795         42                2
-    12   224           0            1           219        159         12                4
-    ...
-    ...
-
-
-class
-	index
-size
-	object size zspage stores
-almost_empty
-	the number of ZS_ALMOST_EMPTY zspages(see below)
-almost_full
-	the number of ZS_ALMOST_FULL zspages(see below)
-obj_allocated
-	the number of objects allocated
-obj_used
-	the number of objects allocated to the user
-pages_used
-	the number of pages allocated for the class
-pages_per_zspage
-	the number of 0-order pages to make a zspage
-
-We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
-
-* n = number of allocated objects
-* N = total number of objects zspage can store
-* f = fullness_threshold_frac(ie, 4 at the moment)
-
-Similarly, we assign zspage to:
-
-* ZS_ALMOST_FULL  when n > N / f
-* ZS_EMPTY        when n == 0
-* ZS_FULL         when n == N
diff --git a/MAINTAINERS b/MAINTAINERS
index fe5daf141501..55fb1daa9057 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5526,7 +5526,7 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-mm-damon
 F:	Documentation/admin-guide/mm/damon/
-F:	Documentation/vm/damon/
+F:	Documentation/mm/damon/
 F:	include/linux/damon.h
 F:	include/trace/events/damon.h
 F:	mm/damon/
@@ -9037,7 +9037,7 @@ HMM - Heterogeneous Memory Management
 M:	Jérôme Glisse <jglisse@redhat.com>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/hmm.rst
+F:	Documentation/mm/hmm.rst
 F:	include/linux/hmm*
 F:	lib/test_hmm*
 F:	mm/hmm*
@@ -9135,8 +9135,8 @@ L:	linux-mm@kvack.org
 S:	Maintained
 F:	Documentation/ABI/testing/sysfs-kernel-mm-hugepages
 F:	Documentation/admin-guide/mm/hugetlbpage.rst
-F:	Documentation/vm/hugetlbfs_reserv.rst
-F:	Documentation/vm/vmemmap_dedup.rst
+F:	Documentation/mm/hugetlbfs_reserv.rst
+F:	Documentation/mm/vmemmap_dedup.rst
 F:	fs/hugetlbfs/
 F:	include/linux/hugetlb.h
 F:	mm/hugetlb.c
@@ -15072,7 +15072,7 @@ M:	Pasha Tatashin <pasha.tatashin@soleen.com>
 M:	Andrew Morton <akpm@linux-foundation.org>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/page_table_check.rst
+F:	Documentation/mm/page_table_check.rst
 F:	include/linux/page_table_check.h
 F:	mm/page_table_check.c
 
@@ -22158,7 +22158,7 @@ M:	Nitin Gupta <ngupta@vflare.org>
 R:	Sergey Senozhatsky <senozhatsky@chromium.org>
 L:	linux-mm@kvack.org
 S:	Maintained
-F:	Documentation/vm/zsmalloc.rst
+F:	Documentation/mm/zsmalloc.rst
 F:	include/linux/zsmalloc.h
 F:	mm/zsmalloc.c
 
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1920d52653b4..db2838cf8c02 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -410,7 +410,7 @@ config ARCH_SPARSEMEM_ENABLE
 	  Say Y to support efficient handling of sparse physical memory,
 	  for architectures which are either NUMA (Non-Uniform Memory Access)
 	  or have huge holes in the physical address space for other reasons.
-	  See <file:Documentation/vm/numa.rst> for more.
+	  See <file:Documentation/mm/numa.rst> for more.
 
 config ARCH_ENABLE_THP_MIGRATION
 	def_bool y
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb9d5fd39d7f..392ff48f77df 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1273,7 +1273,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
  * should return true.
  * We should not call this on a hugetlb entry. We should check for HugeTLB
  * entry using vma->vm_flags
- * The page table walk rule is explained in Documentation/vm/transhuge.rst
+ * The page table walk rule is explained in Documentation/mm/transhuge.rst
  */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index d5a6f101f843..126a36571667 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -4,7 +4,7 @@
  *
  * Authors: Jérôme Glisse <jglisse@redhat.com>
  *
- * See Documentation/vm/hmm.rst for reasons and overview of what HMM is.
+ * See Documentation/mm/hmm.rst for reasons and overview of what HMM is.
  */
 #ifndef LINUX_HMM_H
 #define LINUX_HMM_H
@@ -100,7 +100,7 @@ struct hmm_range {
 };
 
 /*
- * Please see Documentation/vm/hmm.rst for how to use the range API.
+ * Please see Documentation/mm/hmm.rst for how to use the range API.
  */
 int hmm_range_fault(struct hmm_range *range);
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8af304f6b504..9f5ee49482de 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -39,7 +39,7 @@ struct vmem_altmap {
  * must be treated as an opaque object, rather than a "normal" struct page.
  *
  * A more complete discussion of unaddressable memory may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst.
+ * include/linux/hmm.h and Documentation/mm/hmm.rst.
  *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 45fc2c81e370..d6c06e140277 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -198,7 +198,7 @@ struct mmu_notifier_ops {
 	 * invalidate_range_start()/end() notifiers, as
 	 * invalidate_range() already catches the points in time when an
 	 * external TLB range needs to be flushed. For more in depth
-	 * discussion on this see Documentation/vm/mmu_notifier.rst
+	 * discussion on this see Documentation/mm/mmu_notifier.rst
 	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8cd975a8bfeb..2a243616f222 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void);
  *
  * Use mmdrop() to release the reference acquired by mmgrab().
  *
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
+ * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
  * of &mm_struct.mm_count vs &mm_struct.mm_users.
  */
 static inline void mmgrab(struct mm_struct *mm)
@@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm)
  *
  * Use mmput() to release the reference acquired by mmget().
  *
- * See also <Documentation/vm/active_mm.rst> for an in-depth explanation
+ * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
  * of &mm_struct.mm_count vs &mm_struct.mm_users.
  */
 static inline void mmget(struct mm_struct *mm)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0c0fed1b348f..95a5b7aa1ae9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -74,7 +74,7 @@ static inline int current_is_kswapd(void)
 
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
- * Documentation/vm/hmm.rst. Short description is we need struct pages for
+ * Documentation/mm/hmm.rst. Short description is we need struct pages for
  * device memory that is unaddressable (inaccessible) by CPU, so that we can
  * migrate part of a process memory to device memory.
  *
diff --git a/mm/Kconfig b/mm/Kconfig
index 169e64192e48..c1fa4993a56f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -663,7 +663,7 @@ config KSM
 	  the many instances by a single page with that content, so
 	  saving memory until one or another app needs to modify the content.
 	  Recommended for use with KVM, or with other duplicative applications.
-	  See Documentation/vm/ksm.rst for more information: KSM is inactive
+	  See Documentation/mm/ksm.rst for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1ab091f49fc0..dc7df1254f0a 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -35,7 +35,7 @@
 #include <asm/tlbflush.h>
 
 /*
- * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics
  * expectations that are being validated here. All future changes in here
  * or the documentation need to be in sync.
  */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6f69b044a8cc..1a97610308cb 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -4,7 +4,7 @@
  *
  * This code provides the generic "frontend" layer to call a matching
  * "backend" driver implementation of frontswap.  See
- * Documentation/vm/frontswap.rst for more information.
+ * Documentation/mm/frontswap.rst for more information.
  *
  * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
  * Author: Dan Magenheimer
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..f9b90a8d7dfa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1937,7 +1937,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	 * replacing a zero pmd write protected page with a zero pte write
 	 * protected page.
 	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	pmdp_huge_clear_flush(vma, haddr, pmd);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..b36a4ef87a2e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4875,7 +4875,7 @@ again:
 				 * table protection not changing it to point
 				 * to a new page.
 				 *
-				 * See Documentation/vm/mmu_notifier.rst
+				 * See Documentation/mm/mmu_notifier.rst
 				 */
 				huge_ptep_set_wrprotect(src, addr, src_pte);
 				entry = huge_pte_wrprotect(entry);
@@ -6403,7 +6403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * No need to call mmu_notifier_invalidate_range() we are downgrading
 	 * page table protection not changing it to point to a new page.
 	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(&range);
@@ -7102,7 +7102,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	/*
 	 * No need to call mmu_notifier_invalidate_range(), see
-	 * Documentation/vm/mmu_notifier.rst.
+	 * Documentation/mm/mmu_notifier.rst.
 	 */
 	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1089ea8a9c98..ba29c15c53d6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -6,7 +6,7 @@
  *
  *     Author: Muchun Song <songmuchun@bytedance.com>
  *
- * See Documentation/vm/vmemmap_dedup.rst
+ * See Documentation/mm/vmemmap_dedup.rst
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..8d2dc501c92c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		 * No need to notify as we are downgrading page table to read
 		 * only not changing it to point to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
 		/*
@@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	 * No need to notify as we are replacing a read only page with another
 	 * read only page with the same content.
 	 *
-	 * See Documentation/vm/mmu_notifier.rst
+	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	ptep_clear_flush(vma, addr, ptep);
 	set_pte_at_notify(mm, addr, ptep, newpte);
diff --git a/mm/mmap.c b/mm/mmap.c
index 61e6135c54ef..c14d7286a379 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	unsigned long ret = -EINVAL;
 	struct file *file;
 
-	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
 		     current->comm, current->pid);
 
 	if (prot)
diff --git a/mm/rmap.c b/mm/rmap.c
index 5bcb334cd6f2..65e0a767b837 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 		 * downgrading page table protection not changing it to point
 		 * to a new page.
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		if (ret)
 			cleaned++;
@@ -1765,7 +1765,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * to point at a new folio while a device is
 			 * still using this folio.
 			 *
-			 * See Documentation/vm/mmu_notifier.rst
+			 * See Documentation/mm/mmu_notifier.rst
 			 */
 			dec_mm_counter(mm, mm_counter_file(&folio->page));
 		}
@@ -1775,7 +1775,7 @@ discard:
 		 * done above for all cases requiring it to happen under page
 		 * table lock before mmu_notifier_invalidate_range_end()
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
 		if (vma->vm_flags & VM_LOCKED)
@@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		 * done above for all cases requiring it to happen under page
 		 * table lock before mmu_notifier_invalidate_range_end()
 		 *
-		 * See Documentation/vm/mmu_notifier.rst
+		 * See Documentation/mm/mmu_notifier.rst
 		 */
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
 		if (vma->vm_flags & VM_LOCKED)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 652f11a05749..3ff88a2eefb8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -752,7 +752,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 
 		/*
 		 * Reuse the previous page for the rest of tail pages
-		 * See layout diagram in Documentation/vm/vmemmap_dedup.rst
+		 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
 		 */
 		next += PAGE_SIZE;
 		rc = vmemmap_populate_range(next, last, node, NULL,
diff --git a/mm/util.c b/mm/util.c
index 0837570c9225..5df8f2db7ca9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  * succeed and -ENOMEM implies there is not.
  *
  * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting.rst
+ * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
  *
  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
  * Additional code 2002 Jul 20 by Robert Love.
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index c149427eb1c9..74c3dcecf64d 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -8,7 +8,7 @@
  * Or sort by total memory:
  * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
  *
- * See Documentation/vm/page_owner.rst
+ * See Documentation/mm/page_owner.rst
 */
 
 #include <stdio.h>
-- 
cgit v1.2.3


From 586b3b7600e44c1cad52c683ccfbb76fb2c10cc8 Mon Sep 17 00:00:00 2001
From: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Date: Fri, 17 Jun 2022 16:16:56 +0530
Subject: firmware: xilinx: Add configuration values for tri-state

Add configuration values(enable/disable) for tri-state parameter.

Signed-off-by: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Link: https://lore.kernel.org/r/1655462819-28801-2-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/firmware/xlnx-zynqmp.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 1ec73d5352c3..4901fb31de3e 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -368,6 +368,11 @@ enum pm_pinctrl_drive_strength {
 	PM_PINCTRL_DRIVE_STRENGTH_12MA = 3,
 };
 
+enum pm_pinctrl_tri_state {
+	PM_PINCTRL_TRI_STATE_DISABLE = 0,
+	PM_PINCTRL_TRI_STATE_ENABLE = 1,
+};
+
 enum zynqmp_pm_shutdown_type {
 	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN = 0,
 	ZYNQMP_PM_SHUTDOWN_TYPE_RESET = 1,
-- 
cgit v1.2.3


From 72a43046b61a3fe7164a622224bcdfc3cf6b795d Mon Sep 17 00:00:00 2001
From: Chanho Park <chanho61.park@samsung.com>
Date: Wed, 29 Jun 2022 09:41:41 +0900
Subject: tty: serial: samsung_tty: loopback mode support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Internal loopback mode can be supported by setting
UCON register's Loopback Mode bit. The mode & bit can be supported
since s3c2410 and later SoCs. The prefix of LOOPBACK / BIT(5) naming
should be also changed to S3C2410_ in order to avoid confusion.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Chanho Park <chanho61.park@samsung.com>
Link: https://lore.kernel.org/r/20220629004141.51484-1-chanho61.park@samsung.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/samsung_tty.c | 8 ++++++++
 include/linux/serial_s3c.h       | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c
index 3e0aa2923605..8971fbb49fa3 100644
--- a/drivers/tty/serial/samsung_tty.c
+++ b/drivers/tty/serial/samsung_tty.c
@@ -1018,6 +1018,7 @@ static unsigned int s3c24xx_serial_get_mctrl(struct uart_port *port)
 static void s3c24xx_serial_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
 	unsigned int umcon = rd_regl(port, S3C2410_UMCON);
+	unsigned int ucon = rd_reg(port, S3C2410_UCON);
 
 	if (mctrl & TIOCM_RTS)
 		umcon |= S3C2410_UMCOM_RTS_LOW;
@@ -1025,6 +1026,13 @@ static void s3c24xx_serial_set_mctrl(struct uart_port *port, unsigned int mctrl)
 		umcon &= ~S3C2410_UMCOM_RTS_LOW;
 
 	wr_regl(port, S3C2410_UMCON, umcon);
+
+	if (mctrl & TIOCM_LOOP)
+		ucon |= S3C2410_UCON_LOOPBACK;
+	else
+		ucon &= ~S3C2410_UCON_LOOPBACK;
+
+	wr_regl(port, S3C2410_UCON, ucon);
 }
 
 static void s3c24xx_serial_break_ctl(struct uart_port *port, int break_state)
diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index dec15f5b3dec..1672cf0810ef 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -83,7 +83,7 @@
 #define S3C2410_UCON_RXIRQMODE	  (1<<0)
 #define S3C2410_UCON_RXFIFO_TOI	  (1<<7)
 #define S3C2443_UCON_RXERR_IRQEN  (1<<6)
-#define S3C2443_UCON_LOOPBACK	  (1<<5)
+#define S3C2410_UCON_LOOPBACK	  (1<<5)
 
 #define S3C2410_UCON_DEFAULT	  (S3C2410_UCON_TXILEVEL  | \
 				   S3C2410_UCON_RXILEVEL  | \
-- 
cgit v1.2.3


From 6e97eba8ad8748fabb795cffc5d9e1a7dcfd7367 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Tue, 28 Jun 2022 18:59:10 +0300
Subject: vfio: Split migration ops from main device ops

vfio core checks whether the driver sets some migration op (e.g.
set_state/get_state) and accordingly calls its op.

However, currently mlx5 driver sets the above ops without regards to its
migration caps.

This might lead to unexpected usage/Oops if user space may call to the
above ops even if the driver doesn't support migration. As for example,
the migration state_mutex is not initialized in that case.

The cleanest way to manage that seems to split the migration ops from
the main device ops, this will let the driver setting them separately
from the main ops when it's applicable.

As part of that, validate ops construction on registration and include a
check for VFIO_MIGRATION_STOP_COPY since the uAPI claims it must be set
in migration_flags.

HISI driver was changed as well to match this scheme.

This scheme may enable down the road to come with some extra group of
ops (e.g. DMA log) that can be set without regards to the other options
based on driver caps.

Fixes: 6fadb021266d ("vfio/mlx5: Implement vfio_pci driver for mlx5 devices")
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://lore.kernel.org/r/20220628155910.171454-3-yishaih@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 11 +++++++---
 drivers/vfio/pci/mlx5/cmd.c                    |  4 +++-
 drivers/vfio/pci/mlx5/cmd.h                    |  3 ++-
 drivers/vfio/pci/mlx5/main.c                   |  9 +++++---
 drivers/vfio/pci/vfio_pci_core.c               |  7 ++++++
 drivers/vfio/vfio.c                            | 11 +++++-----
 include/linux/vfio.h                           | 30 +++++++++++++++++---------
 7 files changed, 51 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 4def43f5f7b6..ea762e28c1cc 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1185,7 +1185,7 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
 	if (ret)
 		return ret;
 
-	if (core_vdev->ops->migration_set_state) {
+	if (core_vdev->mig_ops) {
 		ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
 		if (ret) {
 			vfio_pci_core_disable(vdev);
@@ -1208,6 +1208,11 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
 	vfio_pci_core_close_device(core_vdev);
 }
 
+static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
+	.migration_set_state = hisi_acc_vfio_pci_set_device_state,
+	.migration_get_state = hisi_acc_vfio_pci_get_device_state,
+};
+
 static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
 	.name = "hisi-acc-vfio-pci-migration",
 	.open_device = hisi_acc_vfio_pci_open_device,
@@ -1219,8 +1224,6 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
 	.mmap = hisi_acc_vfio_pci_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
-	.migration_set_state = hisi_acc_vfio_pci_set_device_state,
-	.migration_get_state = hisi_acc_vfio_pci_get_device_state,
 };
 
 static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
@@ -1272,6 +1275,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device
 		if (!ret) {
 			vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
 						  &hisi_acc_vfio_pci_migrn_ops);
+			hisi_acc_vdev->core_device.vdev.mig_ops =
+					&hisi_acc_vfio_pci_migrn_state_ops;
 		} else {
 			pci_warn(pdev, "migration support failed, continue with generic interface\n");
 			vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev,
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index cdd0c667dc77..dd5d7bfe0a49 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -108,7 +108,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
 	destroy_workqueue(mvdev->cb_wq);
 }
 
-void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev)
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+			       const struct vfio_migration_ops *mig_ops)
 {
 	struct pci_dev *pdev = mvdev->core_device.pdev;
 	int ret;
@@ -149,6 +150,7 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev)
 	mvdev->core_device.vdev.migration_flags =
 		VFIO_MIGRATION_STOP_COPY |
 		VFIO_MIGRATION_P2P;
+	mvdev->core_device.vdev.mig_ops = mig_ops;
 
 end:
 	mlx5_vf_put_core_dev(mvdev->mdev);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index aa692d9ce656..8208f4701a90 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -62,7 +62,8 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
 int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 					  size_t *state_size);
-void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+			       const struct vfio_migration_ops *mig_ops);
 void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
 void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
 int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index d754990f0662..a9b63d15c5d3 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -574,6 +574,11 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
 	vfio_pci_core_close_device(core_vdev);
 }
 
+static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
+	.migration_set_state = mlx5vf_pci_set_device_state,
+	.migration_get_state = mlx5vf_pci_get_device_state,
+};
+
 static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.name = "mlx5-vfio-pci",
 	.open_device = mlx5vf_pci_open_device,
@@ -585,8 +590,6 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
-	.migration_set_state = mlx5vf_pci_set_device_state,
-	.migration_get_state = mlx5vf_pci_get_device_state,
 };
 
 static int mlx5vf_pci_probe(struct pci_dev *pdev,
@@ -599,7 +602,7 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev,
 	if (!mvdev)
 		return -ENOMEM;
 	vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops);
-	mlx5vf_cmd_set_migratable(mvdev);
+	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops);
 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
 	ret = vfio_pci_core_register_device(&mvdev->core_device);
 	if (ret)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index a0d69ddaf90d..2efa06b1fafa 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1855,6 +1855,13 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
 	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 		return -EINVAL;
 
+	if (vdev->vdev.mig_ops) {
+		if (!(vdev->vdev.mig_ops->migration_get_state &&
+		      vdev->vdev.mig_ops->migration_set_state) ||
+		    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
+			return -EINVAL;
+	}
+
 	/*
 	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
 	 * by the host or other users.  We cannot capture the VFs if they
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 61e71c1154be..aac9213a783d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1541,8 +1541,7 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 	struct file *filp = NULL;
 	int ret;
 
-	if (!device->ops->migration_set_state ||
-	    !device->ops->migration_get_state)
+	if (!device->mig_ops)
 		return -ENOTTY;
 
 	ret = vfio_check_feature(flags, argsz,
@@ -1558,7 +1557,8 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 	if (flags & VFIO_DEVICE_FEATURE_GET) {
 		enum vfio_device_mig_state curr_state;
 
-		ret = device->ops->migration_get_state(device, &curr_state);
+		ret = device->mig_ops->migration_get_state(device,
+							   &curr_state);
 		if (ret)
 			return ret;
 		mig.device_state = curr_state;
@@ -1566,7 +1566,7 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
 	}
 
 	/* Handle the VFIO_DEVICE_FEATURE_SET */
-	filp = device->ops->migration_set_state(device, mig.device_state);
+	filp = device->mig_ops->migration_set_state(device, mig.device_state);
 	if (IS_ERR(filp) || !filp)
 		goto out_copy;
 
@@ -1589,8 +1589,7 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
 	};
 	int ret;
 
-	if (!device->ops->migration_set_state ||
-	    !device->ops->migration_get_state)
+	if (!device->mig_ops)
 		return -ENOTTY;
 
 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 49580fa2073a..4d26e149db81 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -32,6 +32,11 @@ struct vfio_device_set {
 struct vfio_device {
 	struct device *dev;
 	const struct vfio_device_ops *ops;
+	/*
+	 * mig_ops is a static property of the vfio_device which must be set
+	 * prior to registering the vfio_device.
+	 */
+	const struct vfio_migration_ops *mig_ops;
 	struct vfio_group *group;
 	struct vfio_device_set *dev_set;
 	struct list_head dev_set_list;
@@ -61,16 +66,6 @@ struct vfio_device {
  *         match, -errno for abort (ex. match with insufficient or incorrect
  *         additional args)
  * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
- * @migration_set_state: Optional callback to change the migration state for
- *         devices that support migration. It's mandatory for
- *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
- *         The returned FD is used for data transfer according to the FSM
- *         definition. The driver is responsible to ensure that FD reaches end
- *         of stream or error whenever the migration FSM leaves a data transfer
- *         state or before close_device() returns.
- * @migration_get_state: Optional callback to get the migration state for
- *         devices that support migration. It's mandatory for
- *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
  */
 struct vfio_device_ops {
 	char	*name;
@@ -87,6 +82,21 @@ struct vfio_device_ops {
 	int	(*match)(struct vfio_device *vdev, char *buf);
 	int	(*device_feature)(struct vfio_device *device, u32 flags,
 				  void __user *arg, size_t argsz);
+};
+
+/**
+ * @migration_set_state: Optional callback to change the migration state for
+ *         devices that support migration. It's mandatory for
+ *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ *         The returned FD is used for data transfer according to the FSM
+ *         definition. The driver is responsible to ensure that FD reaches end
+ *         of stream or error whenever the migration FSM leaves a data transfer
+ *         state or before close_device() returns.
+ * @migration_get_state: Optional callback to get the migration state for
+ *         devices that support migration. It's mandatory for
+ *         VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ */
+struct vfio_migration_ops {
 	struct file *(*migration_set_state)(
 		struct vfio_device *device,
 		enum vfio_device_mig_state new_state);
-- 
cgit v1.2.3


From 0e862838f290147ea9c16db852d8d494b552d38d Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:07 +0200
Subject: bitops: unify non-atomic bitops prototypes across architectures

Currently, there is a mess with the prototypes of the non-atomic
bitops across the different architectures:

ret	bool, int, unsigned long
nr	int, long, unsigned int, unsigned long
addr	volatile unsigned long *, volatile void *

Thankfully, it doesn't provoke any bugs, but can sometimes make
the compiler angry when it's not handy at all.
Adjust all the prototypes to the following standard:

ret	bool				retval can be only 0 or 1
nr	unsigned long			native; signed makes no sense
addr	volatile unsigned long *	bitmaps are arrays of ulongs

Next, some architectures don't define 'arch_' versions as they don't
support instrumentation, others do. To make sure there is always the
same set of callables present and to ease any potential future
changes, make them all follow the rule:
 * architecture-specific files define only 'arch_' versions;
 * non-prefixed versions can be defined only in asm-generic files;
and place the non-prefixed definitions into a new file in
asm-generic to be included by non-instrumented architectures.

Finally, add some static assertions in order to prevent people from
making a mess in this room again.
I also used the %__always_inline attribute consistently, so that
they always get resolved to the actual operations.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/alpha/include/asm/bitops.h                    | 32 ++++++------
 arch/hexagon/include/asm/bitops.h                  | 24 +++++----
 arch/ia64/include/asm/bitops.h                     | 42 ++++++++-------
 arch/m68k/include/asm/bitops.h                     | 49 +++++++++++------
 arch/s390/include/asm/bitops.h                     | 61 +++++++++++-----------
 arch/sh/include/asm/bitops-op32.h                  | 34 +++++++-----
 arch/x86/include/asm/bitops.h                      | 22 ++++----
 include/asm-generic/bitops/generic-non-atomic.h    | 24 ++++-----
 .../asm-generic/bitops/instrumented-non-atomic.h   | 21 +++++---
 include/asm-generic/bitops/non-atomic.h            | 13 +----
 .../bitops/non-instrumented-non-atomic.h           | 16 ++++++
 include/linux/bitops.h                             | 17 ++++++
 tools/include/asm-generic/bitops/non-atomic.h      | 24 +++++----
 13 files changed, 229 insertions(+), 150 deletions(-)
 create mode 100644 include/asm-generic/bitops/non-instrumented-non-atomic.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index e1d8483a45f2..492c7713ddae 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -46,8 +46,8 @@ set_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static inline void
-__set_bit(unsigned long nr, volatile void * addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	int *m = ((int *) addr) + (nr >> 5);
 
@@ -82,8 +82,8 @@ clear_bit_unlock(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static __inline__ void
-__clear_bit(unsigned long nr, volatile void * addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	int *m = ((int *) addr) + (nr >> 5);
 
@@ -94,7 +94,7 @@ static inline void
 __clear_bit_unlock(unsigned long nr, volatile void * addr)
 {
 	smp_mb();
-	__clear_bit(nr, addr);
+	arch___clear_bit(nr, addr);
 }
 
 static inline void
@@ -118,8 +118,8 @@ change_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static __inline__ void
-__change_bit(unsigned long nr, volatile void * addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	int *m = ((int *) addr) + (nr >> 5);
 
@@ -186,8 +186,8 @@ test_and_set_bit_lock(unsigned long nr, volatile void *addr)
 /*
  * WARNING: non atomic version.
  */
-static inline int
-__test_and_set_bit(unsigned long nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = 1 << (nr & 0x1f);
 	int *m = ((int *) addr) + (nr >> 5);
@@ -230,8 +230,8 @@ test_and_clear_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static inline int
-__test_and_clear_bit(unsigned long nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = 1 << (nr & 0x1f);
 	int *m = ((int *) addr) + (nr >> 5);
@@ -272,8 +272,8 @@ test_and_change_bit(unsigned long nr, volatile void * addr)
 /*
  * WARNING: non atomic version.
  */
-static __inline__ int
-__test_and_change_bit(unsigned long nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = 1 << (nr & 0x1f);
 	int *m = ((int *) addr) + (nr >> 5);
@@ -283,8 +283,8 @@ __test_and_change_bit(unsigned long nr, volatile void * addr)
 	return (old & mask) != 0;
 }
 
-static inline int
-test_bit(int nr, const volatile void * addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return (1UL & (((const int *) addr)[nr >> 5] >> (nr & 31))) != 0UL;
 }
@@ -450,6 +450,8 @@ sched_find_first_bit(const unsigned long b[2])
 	return __ffs(tmp) + ofs;
 }
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
+
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h
index 75d6ba3643b8..da500471ac73 100644
--- a/arch/hexagon/include/asm/bitops.h
+++ b/arch/hexagon/include/asm/bitops.h
@@ -127,38 +127,45 @@ static inline void change_bit(int nr, volatile void *addr)
  * be atomic, particularly for things like slab_lock and slab_unlock.
  *
  */
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	test_and_clear_bit(nr, addr);
 }
 
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	test_and_set_bit(nr, addr);
 }
 
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	test_and_change_bit(nr, addr);
 }
 
 /*  Apparently, at least some of these are allowed to be non-atomic  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	return test_and_clear_bit(nr, addr);
 }
 
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	return test_and_set_bit(nr, addr);
 }
 
-static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	return test_and_change_bit(nr, addr);
 }
 
-static inline int __test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	int retval;
 
@@ -172,8 +179,6 @@ static inline int __test_bit(int nr, const volatile unsigned long *addr)
 	return retval;
 }
 
-#define test_bit(nr, addr) __test_bit(nr, addr)
-
 /*
  * ffz - find first zero in word.
  * @word: The word to search
@@ -271,6 +276,7 @@ static inline unsigned long __fls(unsigned long word)
 }
 
 #include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
 
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index 577be93c0818..9f62af7fd7c4 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -53,7 +53,7 @@ set_bit (int nr, volatile void *addr)
 }
 
 /**
- * __set_bit - Set a bit in memory
+ * arch___set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
@@ -61,8 +61,8 @@ set_bit (int nr, volatile void *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __inline__ void
-__set_bit (int nr, volatile void *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	*((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31));
 }
@@ -135,7 +135,7 @@ __clear_bit_unlock(int nr, void *addr)
 }
 
 /**
- * __clear_bit - Clears a bit in memory (non-atomic version)
+ * arch___clear_bit - Clears a bit in memory (non-atomic version)
  * @nr: the bit to clear
  * @addr: the address to start counting from
  *
@@ -143,8 +143,8 @@ __clear_bit_unlock(int nr, void *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __inline__ void
-__clear_bit (int nr, volatile void *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	*((__u32 *) addr + (nr >> 5)) &= ~(1 << (nr & 31));
 }
@@ -175,7 +175,7 @@ change_bit (int nr, volatile void *addr)
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * arch___change_bit - Toggle a bit in memory
  * @nr: the bit to toggle
  * @addr: the address to start counting from
  *
@@ -183,8 +183,8 @@ change_bit (int nr, volatile void *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __inline__ void
-__change_bit (int nr, volatile void *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	*((__u32 *) addr + (nr >> 5)) ^= (1 << (nr & 31));
 }
@@ -224,7 +224,7 @@ test_and_set_bit (int nr, volatile void *addr)
 #define test_and_set_bit_lock test_and_set_bit
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * arch___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -232,8 +232,8 @@ test_and_set_bit (int nr, volatile void *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __inline__ int
-__test_and_set_bit (int nr, volatile void *addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__u32 *p = (__u32 *) addr + (nr >> 5);
 	__u32 m = 1 << (nr & 31);
@@ -269,7 +269,7 @@ test_and_clear_bit (int nr, volatile void *addr)
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * arch___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -277,8 +277,8 @@ test_and_clear_bit (int nr, volatile void *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __inline__ int
-__test_and_clear_bit(int nr, volatile void * addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__u32 *p = (__u32 *) addr + (nr >> 5);
 	__u32 m = 1 << (nr & 31);
@@ -314,14 +314,14 @@ test_and_change_bit (int nr, volatile void *addr)
 }
 
 /**
- * __test_and_change_bit - Change a bit and return its old value
+ * arch___test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
  * This operation is non-atomic and can be reordered.
  */
-static __inline__ int
-__test_and_change_bit (int nr, void *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__u32 old, bit = (1 << (nr & 31));
 	__u32 *m = (__u32 *) addr + (nr >> 5);
@@ -331,8 +331,8 @@ __test_and_change_bit (int nr, void *addr)
 	return (old & bit) != 0;
 }
 
-static __inline__ int
-test_bit (int nr, const volatile void *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1 & (((const volatile __u32 *) addr)[nr >> 5] >> (nr & 31));
 }
@@ -443,6 +443,8 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x)
 
 #ifdef __KERNEL__
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
+
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 51283db53667..71495faf2a90 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -65,8 +65,11 @@ static inline void bfset_mem_set_bit(int nr, volatile unsigned long *vaddr)
 				bfset_mem_set_bit(nr, vaddr))
 #endif
 
-#define __set_bit(nr, vaddr)	set_bit(nr, vaddr)
-
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	set_bit(nr, addr);
+}
 
 static inline void bclr_reg_clear_bit(int nr, volatile unsigned long *vaddr)
 {
@@ -105,8 +108,11 @@ static inline void bfclr_mem_clear_bit(int nr, volatile unsigned long *vaddr)
 				bfclr_mem_clear_bit(nr, vaddr))
 #endif
 
-#define __clear_bit(nr, vaddr)	clear_bit(nr, vaddr)
-
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	clear_bit(nr, addr);
+}
 
 static inline void bchg_reg_change_bit(int nr, volatile unsigned long *vaddr)
 {
@@ -145,14 +151,17 @@ static inline void bfchg_mem_change_bit(int nr, volatile unsigned long *vaddr)
 				bfchg_mem_change_bit(nr, vaddr))
 #endif
 
-#define __change_bit(nr, vaddr)	change_bit(nr, vaddr)
-
-
-static inline int test_bit(int nr, const volatile unsigned long *vaddr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	return (vaddr[nr >> 5] & (1UL << (nr & 31))) != 0;
+	change_bit(nr, addr);
 }
 
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
+{
+	return (addr[nr >> 5] & (1UL << (nr & 31))) != 0;
+}
 
 static inline int bset_reg_test_and_set_bit(int nr,
 					    volatile unsigned long *vaddr)
@@ -201,8 +210,11 @@ static inline int bfset_mem_test_and_set_bit(int nr,
 					bfset_mem_test_and_set_bit(nr, vaddr))
 #endif
 
-#define __test_and_set_bit(nr, vaddr)	test_and_set_bit(nr, vaddr)
-
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	return test_and_set_bit(nr, addr);
+}
 
 static inline int bclr_reg_test_and_clear_bit(int nr,
 					      volatile unsigned long *vaddr)
@@ -251,8 +263,11 @@ static inline int bfclr_mem_test_and_clear_bit(int nr,
 					bfclr_mem_test_and_clear_bit(nr, vaddr))
 #endif
 
-#define __test_and_clear_bit(nr, vaddr)	test_and_clear_bit(nr, vaddr)
-
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	return test_and_clear_bit(nr, addr);
+}
 
 static inline int bchg_reg_test_and_change_bit(int nr,
 					       volatile unsigned long *vaddr)
@@ -301,8 +316,11 @@ static inline int bfchg_mem_test_and_change_bit(int nr,
 					bfchg_mem_test_and_change_bit(nr, vaddr))
 #endif
 
-#define __test_and_change_bit(nr, vaddr) test_and_change_bit(nr, vaddr)
-
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
+{
+	return test_and_change_bit(nr, addr);
+}
 
 /*
  *	The true 68020 and more advanced processors support the "bfffo"
@@ -522,6 +540,7 @@ static inline int __fls(int x)
 #define clear_bit_unlock	clear_bit
 #define __clear_bit_unlock	clear_bit_unlock
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
 #include <asm-generic/bitops/ext2-atomic.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/sched.h>
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 191dc7898b0f..9a7d15da966e 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -113,75 +113,76 @@ static inline bool arch_test_and_change_bit(unsigned long nr,
 	return old & mask;
 }
 
-static inline void arch___set_bit(unsigned long nr, volatile unsigned long *ptr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	*addr |= mask;
+	*p |= mask;
 }
 
-static inline void arch___clear_bit(unsigned long nr,
-				    volatile unsigned long *ptr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	*addr &= ~mask;
+	*p &= ~mask;
 }
 
-static inline void arch___change_bit(unsigned long nr,
-				     volatile unsigned long *ptr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	*addr ^= mask;
+	*p ^= mask;
 }
 
-static inline bool arch___test_and_set_bit(unsigned long nr,
-					   volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 	unsigned long old;
 
-	old = *addr;
-	*addr |= mask;
+	old = *p;
+	*p |= mask;
 	return old & mask;
 }
 
-static inline bool arch___test_and_clear_bit(unsigned long nr,
-					     volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 	unsigned long old;
 
-	old = *addr;
-	*addr &= ~mask;
+	old = *p;
+	*p &= ~mask;
 	return old & mask;
 }
 
-static inline bool arch___test_and_change_bit(unsigned long nr,
-					      volatile unsigned long *ptr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
-	unsigned long *addr = __bitops_word(nr, ptr);
+	unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 	unsigned long old;
 
-	old = *addr;
-	*addr ^= mask;
+	old = *p;
+	*p ^= mask;
 	return old & mask;
 }
 
-static inline bool arch_test_bit(unsigned long nr,
-				 const volatile unsigned long *ptr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
-	const volatile unsigned long *addr = __bitops_word(nr, ptr);
+	const volatile unsigned long *p = __bitops_word(nr, addr);
 	unsigned long mask = __bitops_mask(nr);
 
-	return *addr & mask;
+	return *p & mask;
 }
 
 static inline bool arch_test_and_set_bit_lock(unsigned long nr,
diff --git a/arch/sh/include/asm/bitops-op32.h b/arch/sh/include/asm/bitops-op32.h
index cfe5465acce7..565a85d8b7fb 100644
--- a/arch/sh/include/asm/bitops-op32.h
+++ b/arch/sh/include/asm/bitops-op32.h
@@ -2,6 +2,8 @@
 #ifndef __ASM_SH_BITOPS_OP32_H
 #define __ASM_SH_BITOPS_OP32_H
 
+#include <linux/bits.h>
+
 /*
  * The bit modifying instructions on SH-2A are only capable of working
  * with a 3-bit immediate, which signifies the shift position for the bit
@@ -16,7 +18,8 @@
 #define BYTE_OFFSET(nr)		((nr) % BITS_PER_BYTE)
 #endif
 
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
 		__asm__ __volatile__ (
@@ -33,7 +36,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
 	}
 }
 
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
 		__asm__ __volatile__ (
@@ -52,7 +56,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * arch___change_bit - Toggle a bit in memory
  * @nr: the bit to change
  * @addr: the address to start counting from
  *
@@ -60,7 +64,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	if (__builtin_constant_p(nr)) {
 		__asm__ __volatile__ (
@@ -79,7 +84,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * arch___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -87,7 +92,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -98,7 +104,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * arch___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -106,7 +112,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -117,8 +124,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr,
-					    volatile unsigned long *addr)
+static __always_inline bool
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -129,13 +136,16 @@ static inline int __test_and_change_bit(int nr,
 }
 
 /**
- * test_bit - Determine whether a bit is set
+ * arch_test_bit - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static inline int test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
 
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
+
 #endif /* __ASM_SH_BITOPS_OP32_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index a288ecd230ab..973c6bd17f98 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -63,7 +63,7 @@ arch_set_bit(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-arch___set_bit(long nr, volatile unsigned long *addr)
+arch___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
@@ -89,7 +89,7 @@ arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-arch___clear_bit(long nr, volatile unsigned long *addr)
+arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
@@ -114,7 +114,7 @@ arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-arch___change_bit(long nr, volatile unsigned long *addr)
+arch___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
@@ -145,7 +145,7 @@ arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline bool
-arch___test_and_set_bit(long nr, volatile unsigned long *addr)
+arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	bool oldbit;
 
@@ -171,7 +171,7 @@ arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
  * this without also updating arch/x86/kernel/kvm.c
  */
 static __always_inline bool
-arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
+arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	bool oldbit;
 
@@ -183,7 +183,7 @@ arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
 }
 
 static __always_inline bool
-arch___test_and_change_bit(long nr, volatile unsigned long *addr)
+arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	bool oldbit;
 
@@ -219,10 +219,12 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
 	return oldbit;
 }
 
-#define arch_test_bit(nr, addr)			\
-	(__builtin_constant_p((nr))		\
-	 ? constant_test_bit((nr), (addr))	\
-	 : variable_test_bit((nr), (addr)))
+static __always_inline bool
+arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
+{
+	return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) :
+					  variable_test_bit(nr, addr);
+}
 
 /**
  * __ffs - find first set bit in word
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 7226488810e5..b85b8a2ac239 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -24,7 +24,7 @@
  * may be that only one operation succeeds.
  */
 static __always_inline void
-generic___set_bit(unsigned int nr, volatile unsigned long *addr)
+generic___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -33,7 +33,7 @@ generic___set_bit(unsigned int nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-generic___clear_bit(unsigned int nr, volatile unsigned long *addr)
+generic___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -50,8 +50,8 @@ generic___clear_bit(unsigned int nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static __always_inline
-void generic___change_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline void
+generic___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -68,8 +68,8 @@ void generic___change_bit(unsigned int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __always_inline int
-generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline bool
+generic___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -88,8 +88,8 @@ generic___test_and_set_bit(unsigned int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static __always_inline int
-generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline bool
+generic___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -100,8 +100,8 @@ generic___test_and_clear_bit(unsigned int nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static __always_inline int
-generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr)
+static __always_inline bool
+generic___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -116,8 +116,8 @@ generic___test_and_change_bit(unsigned int nr, volatile unsigned long *addr)
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static __always_inline int
-generic_test_bit(unsigned int nr, const volatile unsigned long *addr)
+static __always_inline bool
+generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	/*
 	 * Unlike the bitops with the '__' prefix above, this one *is* atomic,
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 7ab1ecc37782..b019f77ef21c 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -22,7 +22,8 @@
  * region of memory concurrently, the effect may be that only one operation
  * succeeds.
  */
-static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
+static __always_inline void
+__set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___set_bit(nr, addr);
@@ -37,7 +38,8 @@ static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
  * region of memory concurrently, the effect may be that only one operation
  * succeeds.
  */
-static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline void
+__clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___clear_bit(nr, addr);
@@ -52,7 +54,8 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
  * region of memory concurrently, the effect may be that only one operation
  * succeeds.
  */
-static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
+static __always_inline void
+__change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___change_bit(nr, addr);
@@ -90,7 +93,8 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi
  * This operation is non-atomic. If two instances of this operation race, one
  * can appear to succeed but actually fail.
  */
-static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_set_bit(nr, addr);
@@ -104,7 +108,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
  * This operation is non-atomic. If two instances of this operation race, one
  * can appear to succeed but actually fail.
  */
-static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_clear_bit(nr, addr);
@@ -118,7 +123,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
  * This operation is non-atomic. If two instances of this operation race, one
  * can appear to succeed but actually fail.
  */
-static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_change_bit(nr, addr);
@@ -129,7 +135,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static __always_inline bool test_bit(long nr, const volatile unsigned long *addr)
+static __always_inline bool
+test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
 	return arch_test_bit(nr, addr);
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 23d3abc1e10d..5c37ced343ae 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -5,24 +5,15 @@
 #include <asm-generic/bitops/generic-non-atomic.h>
 
 #define arch___set_bit generic___set_bit
-#define __set_bit arch___set_bit
-
 #define arch___clear_bit generic___clear_bit
-#define __clear_bit arch___clear_bit
-
 #define arch___change_bit generic___change_bit
-#define __change_bit arch___change_bit
 
 #define arch___test_and_set_bit generic___test_and_set_bit
-#define __test_and_set_bit arch___test_and_set_bit
-
 #define arch___test_and_clear_bit generic___test_and_clear_bit
-#define __test_and_clear_bit arch___test_and_clear_bit
-
 #define arch___test_and_change_bit generic___test_and_change_bit
-#define __test_and_change_bit arch___test_and_change_bit
 
 #define arch_test_bit generic_test_bit
-#define test_bit arch_test_bit
+
+#include <asm-generic/bitops/non-instrumented-non-atomic.h>
 
 #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
new file mode 100644
index 000000000000..e0fd7bf72a56
--- /dev/null
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
+#define __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
+
+#define __set_bit		arch___set_bit
+#define __clear_bit		arch___clear_bit
+#define __change_bit		arch___change_bit
+
+#define __test_and_set_bit	arch___test_and_set_bit
+#define __test_and_clear_bit	arch___test_and_clear_bit
+#define __test_and_change_bit	arch___test_and_change_bit
+
+#define test_bit		arch_test_bit
+
+#endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 7aaed501f768..87087454a288 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -26,12 +26,29 @@ extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
 
+#include <asm-generic/bitops/generic-non-atomic.h>
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
  */
 #include <asm/bitops.h>
 
+/* Check that the bitops prototypes are sane */
+#define __check_bitop_pr(name)						\
+	static_assert(__same_type(arch_##name, generic_##name) &&	\
+		      __same_type(name, generic_##name))
+
+__check_bitop_pr(__set_bit);
+__check_bitop_pr(__clear_bit);
+__check_bitop_pr(__change_bit);
+__check_bitop_pr(__test_and_set_bit);
+__check_bitop_pr(__test_and_clear_bit);
+__check_bitop_pr(__test_and_change_bit);
+__check_bitop_pr(test_bit);
+
+#undef __check_bitop_pr
+
 static inline int get_bitmask_order(unsigned int count)
 {
 	int order;
diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h
index 7e10c4b50c5d..e5e78e42e57b 100644
--- a/tools/include/asm-generic/bitops/non-atomic.h
+++ b/tools/include/asm-generic/bitops/non-atomic.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
 
-#include <asm/types.h>
+#include <linux/bits.h>
 
 /**
  * __set_bit - Set a bit in memory
@@ -13,7 +13,8 @@
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+__set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -21,7 +22,8 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
 	*p  |= mask;
 }
 
-static inline void __clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+__clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -38,7 +40,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(int nr, volatile unsigned long *addr)
+static __always_inline void
+__change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -55,7 +58,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -74,7 +78,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __always_inline bool
+__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -85,8 +90,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr,
-					    volatile unsigned long *addr)
+static __always_inline bool
+__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -101,7 +106,8 @@ static inline int __test_and_change_bit(int nr,
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
-static inline int test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline bool
+test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
-- 
cgit v1.2.3


From bb7379bfa680bd48b468e856475778db2ad866c1 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:08 +0200
Subject: bitops: define const_*() versions of the non-atomics

Define const_*() variants of the non-atomic bitops to be used when
the input arguments are compile-time constants, so that the compiler
will be always able to resolve those to compile-time constants as
well. Those are mostly direct aliases for generic_*() with one
exception for const_test_bit(): the original one is declared
atomic-safe and thus doesn't discard the `volatile` qualifier, so
in order to let optimize code, define it separately disregarding
the qualifier.
Add them to the compile-time type checks as well just in case.

Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/asm-generic/bitops/generic-non-atomic.h | 31 +++++++++++++++++++++++++
 include/linux/bitops.h                          |  1 +
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index b85b8a2ac239..3d5ebd24652b 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -127,4 +127,35 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
 
+/*
+ * const_*() definitions provide good compile-time optimizations when
+ * the passed arguments can be resolved at compile time.
+ */
+#define const___set_bit			generic___set_bit
+#define const___clear_bit		generic___clear_bit
+#define const___change_bit		generic___change_bit
+#define const___test_and_set_bit	generic___test_and_set_bit
+#define const___test_and_clear_bit	generic___test_and_clear_bit
+#define const___test_and_change_bit	generic___test_and_change_bit
+
+/**
+ * const_test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ *
+ * A version of generic_test_bit() which discards the `volatile` qualifier to
+ * allow a compiler to optimize code harder. Non-atomic and to be called only
+ * for testing compile-time constants, e.g. by the corresponding macros, not
+ * directly from "regular" code.
+ */
+static __always_inline bool
+const_test_bit(unsigned long nr, const volatile unsigned long *addr)
+{
+	const unsigned long *p = (const unsigned long *)addr + BIT_WORD(nr);
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long val = *p;
+
+	return !!(val & mask);
+}
+
 #endif /* __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 87087454a288..d393297287d5 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -37,6 +37,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 /* Check that the bitops prototypes are sane */
 #define __check_bitop_pr(name)						\
 	static_assert(__same_type(arch_##name, generic_##name) &&	\
+		      __same_type(const_##name, generic_##name) &&	\
 		      __same_type(name, generic_##name))
 
 __check_bitop_pr(__set_bit);
-- 
cgit v1.2.3


From e69eb9c460f128b71c6b995d75a05244e4b6cc3e Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:09 +0200
Subject: bitops: wrap non-atomic bitops with a transparent macro

In preparation for altering the non-atomic bitops with a macro, wrap
them in a transparent definition. This requires prepending one more
'_' to their names in order to be able to do that seamlessly. It is
a simple change, given that all the non-prefixed definitions are now
in asm-generic.
sparc32 already has several triple-underscored functions, so I had
to rename them ('___' -> 'sp32_').

Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/sparc/include/asm/bitops_32.h                 | 18 +++++++-------
 arch/sparc/lib/atomic32.c                          | 12 +++++-----
 .../asm-generic/bitops/instrumented-non-atomic.h   | 28 +++++++++++-----------
 .../bitops/non-instrumented-non-atomic.h           | 14 +++++------
 include/linux/bitops.h                             | 18 +++++++++++++-
 tools/include/asm-generic/bitops/non-atomic.h      | 24 +++++++++----------
 tools/include/linux/bitops.h                       | 16 +++++++++++++
 7 files changed, 81 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h
index 889afa9f990f..3448c191b484 100644
--- a/arch/sparc/include/asm/bitops_32.h
+++ b/arch/sparc/include/asm/bitops_32.h
@@ -19,9 +19,9 @@
 #error only <linux/bitops.h> can be included directly
 #endif
 
-unsigned long ___set_bit(unsigned long *addr, unsigned long mask);
-unsigned long ___clear_bit(unsigned long *addr, unsigned long mask);
-unsigned long ___change_bit(unsigned long *addr, unsigned long mask);
+unsigned long sp32___set_bit(unsigned long *addr, unsigned long mask);
+unsigned long sp32___clear_bit(unsigned long *addr, unsigned long mask);
+unsigned long sp32___change_bit(unsigned long *addr, unsigned long mask);
 
 /*
  * Set bit 'nr' in 32-bit quantity at address 'addr' where bit '0'
@@ -36,7 +36,7 @@ static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *add
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	return ___set_bit(ADDR, mask) != 0;
+	return sp32___set_bit(ADDR, mask) != 0;
 }
 
 static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
@@ -46,7 +46,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	(void) ___set_bit(ADDR, mask);
+	(void) sp32___set_bit(ADDR, mask);
 }
 
 static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
@@ -56,7 +56,7 @@ static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *a
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	return ___clear_bit(ADDR, mask) != 0;
+	return sp32___clear_bit(ADDR, mask) != 0;
 }
 
 static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
@@ -66,7 +66,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	(void) ___clear_bit(ADDR, mask);
+	(void) sp32___clear_bit(ADDR, mask);
 }
 
 static inline int test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
@@ -76,7 +76,7 @@ static inline int test_and_change_bit(unsigned long nr, volatile unsigned long *
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	return ___change_bit(ADDR, mask) != 0;
+	return sp32___change_bit(ADDR, mask) != 0;
 }
 
 static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
@@ -86,7 +86,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
 	ADDR = ((unsigned long *) addr) + (nr >> 5);
 	mask = 1 << (nr & 31);
 
-	(void) ___change_bit(ADDR, mask);
+	(void) sp32___change_bit(ADDR, mask);
 }
 
 #include <asm-generic/bitops/non-atomic.h>
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 8b81d0f00c97..cf80d1ae352b 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -120,7 +120,7 @@ void arch_atomic_set(atomic_t *v, int i)
 }
 EXPORT_SYMBOL(arch_atomic_set);
 
-unsigned long ___set_bit(unsigned long *addr, unsigned long mask)
+unsigned long sp32___set_bit(unsigned long *addr, unsigned long mask)
 {
 	unsigned long old, flags;
 
@@ -131,9 +131,9 @@ unsigned long ___set_bit(unsigned long *addr, unsigned long mask)
 
 	return old & mask;
 }
-EXPORT_SYMBOL(___set_bit);
+EXPORT_SYMBOL(sp32___set_bit);
 
-unsigned long ___clear_bit(unsigned long *addr, unsigned long mask)
+unsigned long sp32___clear_bit(unsigned long *addr, unsigned long mask)
 {
 	unsigned long old, flags;
 
@@ -144,9 +144,9 @@ unsigned long ___clear_bit(unsigned long *addr, unsigned long mask)
 
 	return old & mask;
 }
-EXPORT_SYMBOL(___clear_bit);
+EXPORT_SYMBOL(sp32___clear_bit);
 
-unsigned long ___change_bit(unsigned long *addr, unsigned long mask)
+unsigned long sp32___change_bit(unsigned long *addr, unsigned long mask)
 {
 	unsigned long old, flags;
 
@@ -157,7 +157,7 @@ unsigned long ___change_bit(unsigned long *addr, unsigned long mask)
 
 	return old & mask;
 }
-EXPORT_SYMBOL(___change_bit);
+EXPORT_SYMBOL(sp32___change_bit);
 
 unsigned long __cmpxchg_u32(volatile u32 *ptr, u32 old, u32 new)
 {
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index b019f77ef21c..988a3bbfba34 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -14,7 +14,7 @@
 #include <linux/instrumented.h>
 
 /**
- * __set_bit - Set a bit in memory
+ * ___set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
@@ -23,14 +23,14 @@
  * succeeds.
  */
 static __always_inline void
-__set_bit(unsigned long nr, volatile unsigned long *addr)
+___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___set_bit(nr, addr);
 }
 
 /**
- * __clear_bit - Clears a bit in memory
+ * ___clear_bit - Clears a bit in memory
  * @nr: the bit to clear
  * @addr: the address to start counting from
  *
@@ -39,14 +39,14 @@ __set_bit(unsigned long nr, volatile unsigned long *addr)
  * succeeds.
  */
 static __always_inline void
-__clear_bit(unsigned long nr, volatile unsigned long *addr)
+___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___clear_bit(nr, addr);
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * ___change_bit - Toggle a bit in memory
  * @nr: the bit to change
  * @addr: the address to start counting from
  *
@@ -55,7 +55,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr)
  * succeeds.
  */
 static __always_inline void
-__change_bit(unsigned long nr, volatile unsigned long *addr)
+___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	instrument_write(addr + BIT_WORD(nr), sizeof(long));
 	arch___change_bit(nr, addr);
@@ -86,7 +86,7 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * ___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -94,14 +94,14 @@ static __always_inline void __instrument_read_write_bitop(long nr, volatile unsi
  * can appear to succeed but actually fail.
  */
 static __always_inline bool
-__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_set_bit(nr, addr);
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * ___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -109,14 +109,14 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
  * can appear to succeed but actually fail.
  */
 static __always_inline bool
-__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_clear_bit(nr, addr);
 }
 
 /**
- * __test_and_change_bit - Change a bit and return its old value
+ * ___test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
@@ -124,19 +124,19 @@ __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
  * can appear to succeed but actually fail.
  */
 static __always_inline bool
-__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	__instrument_read_write_bitop(nr, addr);
 	return arch___test_and_change_bit(nr, addr);
 }
 
 /**
- * test_bit - Determine whether a bit is set
+ * _test_bit - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
 static __always_inline bool
-test_bit(unsigned long nr, const volatile unsigned long *addr)
+_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
 	return arch_test_bit(nr, addr);
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index e0fd7bf72a56..bdb9b1ffaee9 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -3,14 +3,14 @@
 #ifndef __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
 #define __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H
 
-#define __set_bit		arch___set_bit
-#define __clear_bit		arch___clear_bit
-#define __change_bit		arch___change_bit
+#define ___set_bit		arch___set_bit
+#define ___clear_bit		arch___clear_bit
+#define ___change_bit		arch___change_bit
 
-#define __test_and_set_bit	arch___test_and_set_bit
-#define __test_and_clear_bit	arch___test_and_clear_bit
-#define __test_and_change_bit	arch___test_and_change_bit
+#define ___test_and_set_bit	arch___test_and_set_bit
+#define ___test_and_clear_bit	arch___test_and_clear_bit
+#define ___test_and_change_bit	arch___test_and_change_bit
 
-#define test_bit		arch_test_bit
+#define _test_bit		arch_test_bit
 
 #endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index d393297287d5..3c3afbae1533 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -26,8 +26,24 @@ extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
 
+/*
+ * Defined here because those may be needed by architecture-specific static
+ * inlines.
+ */
+
 #include <asm-generic/bitops/generic-non-atomic.h>
 
+#define bitop(op, nr, addr)						\
+	op(nr, addr)
+
+#define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
+#define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
+#define __change_bit(nr, addr)		bitop(___change_bit, nr, addr)
+#define __test_and_set_bit(nr, addr)	bitop(___test_and_set_bit, nr, addr)
+#define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
+#define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
+#define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
@@ -38,7 +54,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 #define __check_bitop_pr(name)						\
 	static_assert(__same_type(arch_##name, generic_##name) &&	\
 		      __same_type(const_##name, generic_##name) &&	\
-		      __same_type(name, generic_##name))
+		      __same_type(_##name, generic_##name))
 
 __check_bitop_pr(__set_bit);
 __check_bitop_pr(__clear_bit);
diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h
index e5e78e42e57b..0c472a833408 100644
--- a/tools/include/asm-generic/bitops/non-atomic.h
+++ b/tools/include/asm-generic/bitops/non-atomic.h
@@ -5,7 +5,7 @@
 #include <linux/bits.h>
 
 /**
- * __set_bit - Set a bit in memory
+ * ___set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
@@ -14,7 +14,7 @@
  * may be that only one operation succeeds.
  */
 static __always_inline void
-__set_bit(unsigned long nr, volatile unsigned long *addr)
+___set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -23,7 +23,7 @@ __set_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 static __always_inline void
-__clear_bit(unsigned long nr, volatile unsigned long *addr)
+___clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -32,7 +32,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * __change_bit - Toggle a bit in memory
+ * ___change_bit - Toggle a bit in memory
  * @nr: the bit to change
  * @addr: the address to start counting from
  *
@@ -41,7 +41,7 @@ __clear_bit(unsigned long nr, volatile unsigned long *addr)
  * may be that only one operation succeeds.
  */
 static __always_inline void
-__change_bit(unsigned long nr, volatile unsigned long *addr)
+___change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -50,7 +50,7 @@ __change_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_set_bit - Set a bit and return its old value
+ * ___test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
@@ -59,7 +59,7 @@ __change_bit(unsigned long nr, volatile unsigned long *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static __always_inline bool
-__test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -70,7 +70,7 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * __test_and_clear_bit - Clear a bit and return its old value
+ * ___test_and_clear_bit - Clear a bit and return its old value
  * @nr: Bit to clear
  * @addr: Address to count from
  *
@@ -79,7 +79,7 @@ __test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
  * but actually fail.  You must protect multiple accesses with a lock.
  */
 static __always_inline bool
-__test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -91,7 +91,7 @@ __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
 
 /* WARNING: non atomic and it can be reordered! */
 static __always_inline bool
-__test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
+___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
 	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
@@ -102,12 +102,12 @@ __test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
 }
 
 /**
- * test_bit - Determine whether a bit is set
+ * _test_bit - Determine whether a bit is set
  * @nr: bit number to test
  * @addr: Address to start counting from
  */
 static __always_inline bool
-test_bit(unsigned long nr, const volatile unsigned long *addr)
+_test_bit(unsigned long nr, const volatile unsigned long *addr)
 {
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h
index 5fca38fe1ba8..f18683b95ea6 100644
--- a/tools/include/linux/bitops.h
+++ b/tools/include/linux/bitops.h
@@ -25,6 +25,22 @@ extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
 extern unsigned long __sw_hweight64(__u64 w);
 
+/*
+ * Defined here because those may be needed by architecture-specific static
+ * inlines.
+ */
+
+#define bitop(op, nr, addr)						\
+	op(nr, addr)
+
+#define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
+#define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
+#define __change_bit(nr, addr)		bitop(___change_bit, nr, addr)
+#define __test_and_set_bit(nr, addr)	bitop(___test_and_set_bit, nr, addr)
+#define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
+#define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
+#define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
-- 
cgit v1.2.3


From b03fc1173c0c2bb8fad61902a862985cecdc4b1b Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:10 +0200
Subject: bitops: let optimize out non-atomic bitops on compile-time constants

Currently, many architecture-specific non-atomic bitop
implementations use inline asm or other hacks which are faster or
more robust when working with "real" variables (i.e. fields from
the structures etc.), but the compilers have no clue how to optimize
them out when called on compile-time constants. That said, the
following code:

	DECLARE_BITMAP(foo, BITS_PER_LONG) = { }; // -> unsigned long foo[1];
	unsigned long bar = BIT(BAR_BIT);
	unsigned long baz = 0;

	__set_bit(FOO_BIT, foo);
	baz |= BIT(BAZ_BIT);

	BUILD_BUG_ON(!__builtin_constant_p(test_bit(FOO_BIT, foo));
	BUILD_BUG_ON(!__builtin_constant_p(bar & BAR_BIT));
	BUILD_BUG_ON(!__builtin_constant_p(baz & BAZ_BIT));

triggers the first assertion on x86_64, which means that the
compiler is unable to evaluate it to a compile-time initializer
when the architecture-specific bitop is used even if it's obvious.
In order to let the compiler optimize out such cases, expand the
bitop() macro to use the "constant" C non-atomic bitop
implementations when all of the arguments passed are compile-time
constants, which means that the result will be a compile-time
constant as well, so that it produces more efficient and simple
code in 100% cases, comparing to the architecture-specific
counterparts.

The savings are architecture, compiler and compiler flags dependent,
for example, on x86_64 -O2:

GCC 12: add/remove: 78/29 grow/shrink: 332/525 up/down: 31325/-61560 (-30235)
LLVM 13: add/remove: 79/76 grow/shrink: 184/537 up/down: 55076/-141892 (-86816)
LLVM 14: add/remove: 10/3 grow/shrink: 93/138 up/down: 3705/-6992 (-3287)

and ARM64 (courtesy of Mark):

GCC 11: add/remove: 92/29 grow/shrink: 933/2766 up/down: 39340/-82580 (-43240)
LLVM 14: add/remove: 21/11 grow/shrink: 620/651 up/down: 12060/-15824 (-3764)

Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Reviewed-by: Marco Elver <elver@google.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 3c3afbae1533..cf9bf65039f2 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -33,8 +33,24 @@ extern unsigned long __sw_hweight64(__u64 w);
 
 #include <asm-generic/bitops/generic-non-atomic.h>
 
+/*
+ * Many architecture-specific non-atomic bitops contain inline asm code and due
+ * to that the compiler can't optimize them to compile-time expressions or
+ * constants. In contrary, generic_*() helpers are defined in pure C and
+ * compilers optimize them just well.
+ * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively
+ * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when
+ * the arguments can be resolved at compile time. That expression itself is a
+ * constant and doesn't bring any functional changes to the rest of cases.
+ * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when
+ * passing a bitmap from .bss or .data (-> `!!addr` is always true).
+ */
 #define bitop(op, nr, addr)						\
-	op(nr, addr)
+	((__builtin_constant_p(nr) &&					\
+	  __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) &&	\
+	  (uintptr_t)(addr) != (uintptr_t)NULL &&			\
+	  __builtin_constant_p(*(const unsigned long *)(addr))) ?	\
+	 const##op(nr, addr) : op(nr, addr))
 
 #define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
 #define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
-- 
cgit v1.2.3


From 3e7e5baaaba78075a7f3a57432609e363bf2a486 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alexandr.lobakin@intel.com>
Date: Fri, 24 Jun 2022 14:13:12 +0200
Subject: bitmap: don't assume compiler evaluates small mem*() builtins calls

Intel kernel bot triggered the build bug on ARC architecture that
in fact is as follows:

	DECLARE_BITMAP(bitmap, BITS_PER_LONG);

	bitmap_clear(bitmap, 0, BITS_PER_LONG);
	BUILD_BUG_ON(!__builtin_constant_p(*bitmap));

which can be expanded to:

	unsigned long bitmap[1];

	memset(bitmap, 0, sizeof(*bitmap));
	BUILD_BUG_ON(!__builtin_constant_p(*bitmap));

In most cases, a compiler is able to expand small/simple mem*()
calls to simple assignments or bitops, in this case that would mean:

	unsigned long bitmap[1] = { 0 };

	BUILD_BUG_ON(!__builtin_constant_p(*bitmap));

and on most architectures this works, but not on ARC, despite having
-O3 for every build.
So, to make this work, in case when the last bit to modify is still
within the first long (small_const_nbits()), just use plain
assignments for the rest of bitmap_*() functions which still use
mem*(), but didn't receive such compile-time optimizations yet.
This doesn't have the same coverage as compilers provide, but at
least something to start:

text: add/remove: 3/7 grow/shrink: 43/78 up/down: 1848/-3370 (-1546)
data: add/remove: 1/11 grow/shrink: 0/8 up/down: 4/-356 (-352)

notably cpumask_*() family when NR_CPUS <= BITS_PER_LONG:

netif_get_num_default_rss_queues              38       4     -34
cpumask_copy                                  90       -     -90
cpumask_clear                                146       -    -146

and the abovementioned assertion started passing.

Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index f091a1664bf1..c91638e507f2 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -238,20 +238,32 @@ extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-	memset(dst, 0, len);
+
+	if (small_const_nbits(nbits))
+		*dst = 0;
+	else
+		memset(dst, 0, len);
 }
 
 static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-	memset(dst, 0xff, len);
+
+	if (small_const_nbits(nbits))
+		*dst = ~0UL;
+	else
+		memset(dst, 0xff, len);
 }
 
 static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 			unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-	memcpy(dst, src, len);
+
+	if (small_const_nbits(nbits))
+		*dst = *src;
+	else
+		memcpy(dst, src, len);
 }
 
 /*
@@ -431,6 +443,8 @@ static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
 		__set_bit(start, map);
+	else if (small_const_nbits(start + nbits))
+		*map |= GENMASK(start + nbits - 1, start);
 	else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
 		 IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
 		 __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
@@ -445,6 +459,8 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
 		__clear_bit(start, map);
+	else if (small_const_nbits(start + nbits))
+		*map &= ~GENMASK(start + nbits - 1, start);
 	else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
 		 IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
 		 __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
-- 
cgit v1.2.3


From eb003bf3ba221bb3d21d1fdcddaa36c158fd2d8f Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 24 Jun 2022 20:36:39 +0200
Subject: platform/surface: aggregator: Add helper macros for requests with
 argument and return value

Add helper macros for synchronous stack-allocated Surface Aggregator
request with both argument and return value, similar to the current
argument-only and return-value-only ones.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220624183642.910893-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/surface_aggregator/controller.h | 125 ++++++++++++++++++++++++++
 include/linux/surface_aggregator/device.h     |  36 ++++++++
 2 files changed, 161 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index 50a2b4926c06..d11a1c6e3186 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -469,6 +469,67 @@ struct ssam_request_spec_md {
 		return 0;							\
 	}
 
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_WR() - Define synchronous SAM request function with
+ * both argument and return value.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by @spec,
+ * with the request taking an argument of type @atype and having a return value
+ * of type @rtype. The generated function takes care of setting up the request
+ * and response structs, buffer allocation, as well as execution of the request
+ * itself, returning once the request has been fully completed. The required
+ * transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, const atype *arg, rtype *ret)``, returning the status
+ * of the request, which is zero on success and negative on failure. The
+ * ``ctrl`` parameter is the controller via which the request is sent. The
+ * request argument is specified via the ``arg`` pointer. The request's return
+ * value is written to the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_WR(name, atype, rtype, spec...)		\
+	static int name(struct ssam_controller *ctrl, const atype *arg, rtype *ret) \
+	{									\
+		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
+		struct ssam_request rqst;					\
+		struct ssam_response rsp;					\
+		int status;							\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = s.target_id;					\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = s.instance_id;				\
+		rqst.flags = s.flags | SSAM_REQUEST_HAS_RESPONSE;		\
+		rqst.length = sizeof(atype);					\
+		rqst.payload = (u8 *)arg;					\
+										\
+		rsp.capacity = sizeof(rtype);					\
+		rsp.length = 0;							\
+		rsp.pointer = (u8 *)ret;					\
+										\
+		status = ssam_request_sync_onstack(ctrl, &rqst, &rsp, sizeof(atype)); \
+		if (status)							\
+			return status;						\
+										\
+		if (rsp.length != sizeof(rtype)) {				\
+			struct device *dev = ssam_controller_device(ctrl);	\
+			dev_err(dev,						\
+				"rqst: invalid response length, expected %zu, got %zu (tc: %#04x, cid: %#04x)", \
+				sizeof(rtype), rsp.length, rqst.target_category,\
+				rqst.command_id);				\
+			return -EIO;						\
+		}								\
+										\
+		return 0;							\
+	}
+
 /**
  * SSAM_DEFINE_SYNC_REQUEST_MD_N() - Define synchronous multi-device SAM
  * request function with neither argument nor return value.
@@ -613,6 +674,70 @@ struct ssam_request_spec_md {
 		return 0;							\
 	}
 
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_MD_WR() - Define synchronous multi-device SAM
+ * request function with both argument and return value.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by @spec,
+ * with the request taking an argument of type @atype and having a return value
+ * of type @rtype. Device specifying parameters are not hard-coded, but instead
+ * must be provided to the function. The generated function takes care of
+ * setting up the request and response structs, buffer allocation, as well as
+ * execution of the request itself, returning once the request has been fully
+ * completed. The required transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, u8 tid, u8 iid, const atype *arg, rtype *ret)``,
+ * returning the status of the request, which is zero on success and negative
+ * on failure. The ``ctrl`` parameter is the controller via which the request
+ * is sent, ``tid`` the target ID for the request, and ``iid`` the instance ID.
+ * The request argument is specified via the ``arg`` pointer. The request's
+ * return value is written to the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_MD_WR(name, atype, rtype, spec...)		\
+	static int name(struct ssam_controller *ctrl, u8 tid, u8 iid,		\
+			const atype *arg, rtype *ret)				\
+	{									\
+		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
+		struct ssam_request rqst;					\
+		struct ssam_response rsp;					\
+		int status;							\
+										\
+		rqst.target_category = s.target_category;			\
+		rqst.target_id = tid;						\
+		rqst.command_id = s.command_id;					\
+		rqst.instance_id = iid;						\
+		rqst.flags = s.flags | SSAM_REQUEST_HAS_RESPONSE;		\
+		rqst.length = sizeof(atype);					\
+		rqst.payload = (u8 *)arg;					\
+										\
+		rsp.capacity = sizeof(rtype);					\
+		rsp.length = 0;							\
+		rsp.pointer = (u8 *)ret;					\
+										\
+		status = ssam_request_sync_onstack(ctrl, &rqst, &rsp, sizeof(atype)); \
+		if (status)							\
+			return status;						\
+										\
+		if (rsp.length != sizeof(rtype)) {				\
+			struct device *dev = ssam_controller_device(ctrl);	\
+			dev_err(dev,						\
+				"rqst: invalid response length, expected %zu, got %zu (tc: %#04x, cid: %#04x)", \
+				sizeof(rtype), rsp.length, rqst.target_category,\
+				rqst.command_id);				\
+			return -EIO;						\
+		}								\
+										\
+		return 0;							\
+	}
+
 
 /* -- Event notifier/callbacks. --------------------------------------------- */
 
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index c418f7f2732d..6cf7e80312d5 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -483,6 +483,42 @@ static inline void ssam_remove_clients(struct device *dev) {}
 				    sdev->uid.instance, ret);		\
 	}
 
+/**
+ * SSAM_DEFINE_SYNC_REQUEST_CL_WR() - Define synchronous client-device SAM
+ * request function with argument and return value.
+ * @name:  Name of the generated function.
+ * @atype: Type of the request's argument.
+ * @rtype: Type of the request's return value.
+ * @spec:  Specification (&struct ssam_request_spec_md) defining the request.
+ *
+ * Defines a function executing the synchronous SAM request specified by @spec,
+ * with the request taking an argument of type @atype and having a return value
+ * of type @rtype. Device specifying parameters are not hard-coded, but instead
+ * are provided via the client device, specifically its UID, supplied when
+ * calling this function. The generated function takes care of setting up the
+ * request struct, buffer allocation, as well as execution of the request
+ * itself, returning once the request has been fully completed. The required
+ * transport buffer will be allocated on the stack.
+ *
+ * The generated function is defined as ``static int name(struct ssam_device
+ * *sdev, const atype *arg, rtype *ret)``, returning the status of the request,
+ * which is zero on success and negative on failure. The ``sdev`` parameter
+ * specifies both the target device of the request and by association the
+ * controller via which the request is sent. The request's argument is
+ * specified via the ``arg`` pointer. The request's return value is written to
+ * the memory pointed to by the ``ret`` parameter.
+ *
+ * Refer to ssam_request_sync_onstack() for more details on the behavior of
+ * the generated function.
+ */
+#define SSAM_DEFINE_SYNC_REQUEST_CL_WR(name, atype, rtype, spec...)		\
+	SSAM_DEFINE_SYNC_REQUEST_MD_WR(__raw_##name, atype, rtype, spec)	\
+	static int name(struct ssam_device *sdev, const atype *arg, rtype *ret)	\
+	{									\
+		return __raw_##name(sdev->ctrl, sdev->uid.target,		\
+				    sdev->uid.instance, arg, ret);		\
+	}
+
 
 /* -- Helpers for client-device notifiers. ---------------------------------- */
 
-- 
cgit v1.2.3


From 4a4ab610b8ae912c28a4fd28442ef24ed7a1a5bd Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Fri, 24 Jun 2022 22:57:58 +0200
Subject: platform/surface: aggregator: Move device registry helper functions
 to core module

Move helper functions for client device registration to the core module.
This simplifies addition of future DT/OF support and also allows us to
split out the device hub drivers into their own module.

At the same time, also improve device node validation a bit by not
silently skipping devices with invalid device UID specifiers. Further,
ensure proper lifetime management for the firmware/software nodes
associated with the added devices.

Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20220624205800.1355621-2-luzmaximilian@gmail.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/surface/aggregator/bus.c          | 149 ++++++++++++++++++---
 .../platform/surface/surface_aggregator_registry.c |  75 +----------
 include/linux/surface_aggregator/device.h          |  52 +++++++
 3 files changed, 187 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/surface/aggregator/bus.c b/drivers/platform/surface/aggregator/bus.c
index abbbb5b08b07..e0b0381a2834 100644
--- a/drivers/platform/surface/aggregator/bus.c
+++ b/drivers/platform/surface/aggregator/bus.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/property.h>
 #include <linux/slab.h>
 
 #include <linux/surface_aggregator/controller.h>
@@ -14,6 +15,9 @@
 #include "bus.h"
 #include "controller.h"
 
+
+/* -- Device and bus functions. --------------------------------------------- */
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
@@ -46,6 +50,7 @@ static void ssam_device_release(struct device *dev)
 	struct ssam_device *sdev = to_ssam_device(dev);
 
 	ssam_controller_put(sdev->ctrl);
+	fwnode_handle_put(sdev->dev.fwnode);
 	kfree(sdev);
 }
 
@@ -363,6 +368,134 @@ void ssam_device_driver_unregister(struct ssam_device_driver *sdrv)
 }
 EXPORT_SYMBOL_GPL(ssam_device_driver_unregister);
 
+
+/* -- Bus registration. ----------------------------------------------------- */
+
+/**
+ * ssam_bus_register() - Register and set-up the SSAM client device bus.
+ */
+int ssam_bus_register(void)
+{
+	return bus_register(&ssam_bus_type);
+}
+
+/**
+ * ssam_bus_unregister() - Unregister the SSAM client device bus.
+ */
+void ssam_bus_unregister(void)
+{
+	return bus_unregister(&ssam_bus_type);
+}
+
+
+/* -- Helpers for controller and hub devices. ------------------------------- */
+
+static int ssam_device_uid_from_string(const char *str, struct ssam_device_uid *uid)
+{
+	u8 d, tc, tid, iid, fn;
+	int n;
+
+	n = sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx", &d, &tc, &tid, &iid, &fn);
+	if (n != 5)
+		return -EINVAL;
+
+	uid->domain = d;
+	uid->category = tc;
+	uid->target = tid;
+	uid->instance = iid;
+	uid->function = fn;
+
+	return 0;
+}
+
+static int ssam_get_uid_for_node(struct fwnode_handle *node, struct ssam_device_uid *uid)
+{
+	const char *str = fwnode_get_name(node);
+
+	/*
+	 * To simplify definitions of firmware nodes, we set the device name
+	 * based on the UID of the device, prefixed with "ssam:".
+	 */
+	if (strncmp(str, "ssam:", strlen("ssam:")) != 0)
+		return -ENODEV;
+
+	str += strlen("ssam:");
+	return ssam_device_uid_from_string(str, uid);
+}
+
+static int ssam_add_client_device(struct device *parent, struct ssam_controller *ctrl,
+				  struct fwnode_handle *node)
+{
+	struct ssam_device_uid uid;
+	struct ssam_device *sdev;
+	int status;
+
+	status = ssam_get_uid_for_node(node, &uid);
+	if (status)
+		return status;
+
+	sdev = ssam_device_alloc(ctrl, uid);
+	if (!sdev)
+		return -ENOMEM;
+
+	sdev->dev.parent = parent;
+	sdev->dev.fwnode = fwnode_handle_get(node);
+
+	status = ssam_device_add(sdev);
+	if (status)
+		ssam_device_put(sdev);
+
+	return status;
+}
+
+/**
+ * __ssam_register_clients() - Register client devices defined under the
+ * given firmware node as children of the given device.
+ * @parent: The parent device under which clients should be registered.
+ * @ctrl: The controller with which client should be registered.
+ * @node: The firmware node holding definitions of the devices to be added.
+ *
+ * Register all clients that have been defined as children of the given root
+ * firmware node as children of the given parent device. The respective child
+ * firmware nodes will be associated with the correspondingly created child
+ * devices.
+ *
+ * The given controller will be used to instantiate the new devices. See
+ * ssam_device_add() for details.
+ *
+ * Note that, generally, the use of either ssam_device_register_clients() or
+ * ssam_register_clients() should be preferred as they directly use the
+ * firmware node and/or controller associated with the given device. This
+ * function is only intended for use when different device specifications (e.g.
+ * ACPI and firmware nodes) need to be combined (as is done in the platform hub
+ * of the device registry).
+ *
+ * Return: Returns zero on success, nonzero on failure.
+ */
+int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl,
+			    struct fwnode_handle *node)
+{
+	struct fwnode_handle *child;
+	int status;
+
+	fwnode_for_each_child_node(node, child) {
+		/*
+		 * Try to add the device specified in the firmware node. If
+		 * this fails with -ENODEV, the node does not specify any SSAM
+		 * device, so ignore it and continue with the next one.
+		 */
+		status = ssam_add_client_device(parent, ctrl, child);
+		if (status && status != -ENODEV)
+			goto err;
+	}
+
+	return 0;
+err:
+	ssam_remove_clients(parent);
+	return status;
+}
+EXPORT_SYMBOL_GPL(__ssam_register_clients);
+
 static int ssam_remove_device(struct device *dev, void *_data)
 {
 	struct ssam_device *sdev = to_ssam_device(dev);
@@ -387,19 +520,3 @@ void ssam_remove_clients(struct device *dev)
 	device_for_each_child_reverse(dev, NULL, ssam_remove_device);
 }
 EXPORT_SYMBOL_GPL(ssam_remove_clients);
-
-/**
- * ssam_bus_register() - Register and set-up the SSAM client device bus.
- */
-int ssam_bus_register(void)
-{
-	return bus_register(&ssam_bus_type);
-}
-
-/**
- * ssam_bus_unregister() - Unregister the SSAM client device bus.
- */
-void ssam_bus_unregister(void)
-{
-	return bus_unregister(&ssam_bus_type);
-}
diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c
index f1c5905f1c16..c680792a037e 100644
--- a/drivers/platform/surface/surface_aggregator_registry.c
+++ b/drivers/platform/surface/surface_aggregator_registry.c
@@ -286,76 +286,6 @@ static const struct software_node *ssam_node_group_sp8[] = {
 };
 
 
-/* -- Device registry helper functions. ------------------------------------- */
-
-static int ssam_uid_from_string(const char *str, struct ssam_device_uid *uid)
-{
-	u8 d, tc, tid, iid, fn;
-	int n;
-
-	n = sscanf(str, "ssam:%hhx:%hhx:%hhx:%hhx:%hhx", &d, &tc, &tid, &iid, &fn);
-	if (n != 5)
-		return -EINVAL;
-
-	uid->domain = d;
-	uid->category = tc;
-	uid->target = tid;
-	uid->instance = iid;
-	uid->function = fn;
-
-	return 0;
-}
-
-static int ssam_hub_add_device(struct device *parent, struct ssam_controller *ctrl,
-			       struct fwnode_handle *node)
-{
-	struct ssam_device_uid uid;
-	struct ssam_device *sdev;
-	int status;
-
-	status = ssam_uid_from_string(fwnode_get_name(node), &uid);
-	if (status)
-		return status;
-
-	sdev = ssam_device_alloc(ctrl, uid);
-	if (!sdev)
-		return -ENOMEM;
-
-	sdev->dev.parent = parent;
-	sdev->dev.fwnode = node;
-
-	status = ssam_device_add(sdev);
-	if (status)
-		ssam_device_put(sdev);
-
-	return status;
-}
-
-static int ssam_hub_register_clients(struct device *parent, struct ssam_controller *ctrl,
-				     struct fwnode_handle *node)
-{
-	struct fwnode_handle *child;
-	int status;
-
-	fwnode_for_each_child_node(node, child) {
-		/*
-		 * Try to add the device specified in the firmware node. If
-		 * this fails with -EINVAL, the node does not specify any SSAM
-		 * device, so ignore it and continue with the next one.
-		 */
-
-		status = ssam_hub_add_device(parent, ctrl, child);
-		if (status && status != -EINVAL)
-			goto err;
-	}
-
-	return 0;
-err:
-	ssam_remove_clients(parent);
-	return status;
-}
-
-
 /* -- SSAM generic subsystem hub driver framework. -------------------------- */
 
 enum ssam_hub_state {
@@ -385,7 +315,6 @@ struct ssam_hub {
 static void ssam_hub_update_workfn(struct work_struct *work)
 {
 	struct ssam_hub *hub = container_of(work, struct ssam_hub, update_work.work);
-	struct fwnode_handle *node = dev_fwnode(&hub->sdev->dev);
 	enum ssam_hub_state state;
 	int status = 0;
 
@@ -425,7 +354,7 @@ static void ssam_hub_update_workfn(struct work_struct *work)
 	hub->state = state;
 
 	if (hub->state == SSAM_HUB_CONNECTED)
-		status = ssam_hub_register_clients(&hub->sdev->dev, hub->sdev->ctrl, node);
+		status = ssam_device_register_clients(hub->sdev);
 	else
 		ssam_remove_clients(&hub->sdev->dev);
 
@@ -769,7 +698,7 @@ static int ssam_platform_hub_probe(struct platform_device *pdev)
 
 	set_secondary_fwnode(&pdev->dev, root);
 
-	status = ssam_hub_register_clients(&pdev->dev, ctrl, root);
+	status = __ssam_register_clients(&pdev->dev, ctrl, root);
 	if (status) {
 		set_secondary_fwnode(&pdev->dev, NULL);
 		software_node_unregister_node_group(nodes);
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 6cf7e80312d5..46c45d1b6368 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -15,6 +15,7 @@
 
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
+#include <linux/property.h>
 #include <linux/types.h>
 
 #include <linux/surface_aggregator/controller.h>
@@ -375,11 +376,62 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
 /* -- Helpers for controller and hub devices. ------------------------------- */
 
 #ifdef CONFIG_SURFACE_AGGREGATOR_BUS
+
+int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl,
+			    struct fwnode_handle *node);
 void ssam_remove_clients(struct device *dev);
+
 #else /* CONFIG_SURFACE_AGGREGATOR_BUS */
+
+static inline int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl,
+					  struct fwnode_handle *node)
+{
+	return 0;
+}
+
 static inline void ssam_remove_clients(struct device *dev) {}
+
 #endif /* CONFIG_SURFACE_AGGREGATOR_BUS */
 
+/**
+ * ssam_register_clients() - Register all client devices defined under the
+ * given parent device.
+ * @dev: The parent device under which clients should be registered.
+ * @ctrl: The controller with which client should be registered.
+ *
+ * Register all clients that have via firmware nodes been defined as children
+ * of the given (parent) device. The respective child firmware nodes will be
+ * associated with the correspondingly created child devices.
+ *
+ * The given controller will be used to instantiate the new devices. See
+ * ssam_device_add() for details.
+ *
+ * Return: Returns zero on success, nonzero on failure.
+ */
+static inline int ssam_register_clients(struct device *dev, struct ssam_controller *ctrl)
+{
+	return __ssam_register_clients(dev, ctrl, dev_fwnode(dev));
+}
+
+/**
+ * ssam_device_register_clients() - Register all client devices defined under
+ * the given SSAM parent device.
+ * @sdev: The parent device under which clients should be registered.
+ *
+ * Register all clients that have via firmware nodes been defined as children
+ * of the given (parent) device. The respective child firmware nodes will be
+ * associated with the correspondingly created child devices.
+ *
+ * The controller used by the parent device will be used to instantiate the new
+ * devices. See ssam_device_add() for details.
+ *
+ * Return: Returns zero on success, nonzero on failure.
+ */
+static inline int ssam_device_register_clients(struct ssam_device *sdev)
+{
+	return ssam_register_clients(&sdev->dev, sdev->ctrl);
+}
+
 
 /* -- Helpers for client-device requests. ----------------------------------- */
 
-- 
cgit v1.2.3


From 507db7927cd181d409dd495c8384b8e14c21c600 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Sun, 3 Jul 2022 18:08:36 -0700
Subject: mm: rmap: use the correct parameter name for DEFINE_PAGE_VMA_WALK

The parameter used by DEFINE_PAGE_VMA_WALK is _page not page, fix the
parameter name.  It didn't cause any build error, it is probably because
the only caller is write_protect_page() from ksm.c, which pass in page.

Link: https://lkml.kernel.org/r/20220512174551.81279-1-shy828301@gmail.com
Fixes: 2aff7a4755be ("mm: Convert page_vma_mapped_walk to work on PFNs")
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9ec23138e410..bf80adca980b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -325,8 +325,8 @@ struct page_vma_mapped_walk {
 #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
 	struct page_vma_mapped_walk name = {				\
 		.pfn = page_to_pfn(_page),				\
-		.nr_pages = compound_nr(page),				\
-		.pgoff = page_to_pgoff(page),				\
+		.nr_pages = compound_nr(_page),				\
+		.pgoff = page_to_pgoff(_page),				\
 		.vma = _vma,						\
 		.address = _address,					\
 		.flags = _flags,					\
-- 
cgit v1.2.3


From c453d8c7d1384d7e1d7f26d3ec0d527092edf801 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Fri, 13 May 2022 12:17:05 -0700
Subject: mm/page_vma_mapped.c: check possible huge PMD map with
 transhuge_vma_suitable()

IIUC page_vma_mapped_walk() checks if the vma is possibly huge PMD mapped
with transparent_hugepage_active() and "pvmw->nr_pages >= HPAGE_PMD_NR".

Actually pvmw->nr_pages is returned by compound_nr() or folio_nr_pages(),
so the page should be THP as long as "pvmw->nr_pages >= HPAGE_PMD_NR".
And it is guaranteed THP is allocated for valid VMA in the first place.
But it may be not PMD mapped if the VMA is file VMA and it is not properly
aligned.  The transhuge_vma_suitable() is used to do such check, so
replace transparent_hugepage_active() to it, which is too heavy and
overkilling.

Link: https://lkml.kernel.org/r/20220513191705.457775-1-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 8 ++++++--
 mm/page_vma_mapped.c    | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de29821231c9..648cb3ce7099 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -117,8 +117,10 @@ extern struct kobj_attribute shmem_enabled_attr;
 extern unsigned long transparent_hugepage_flags;
 
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
+		unsigned long addr)
 {
+	unsigned long haddr;
+
 	/* Don't have to check pgoff for anonymous vma */
 	if (!vma_is_anonymous(vma)) {
 		if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -126,6 +128,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 			return false;
 	}
 
+	haddr = addr & HPAGE_PMD_MASK;
+
 	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
 		return false;
 	return true;
@@ -342,7 +346,7 @@ static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
 }
 
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
-		unsigned long haddr)
+		unsigned long addr)
 {
 	return false;
 }
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..e971a467fcdf 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -243,7 +243,7 @@ restart:
 			 * cleared *pmd but not decremented compound_mapcount().
 			 */
 			if ((pvmw->flags & PVMW_SYNC) &&
-			    transparent_hugepage_active(vma) &&
+			    transhuge_vma_suitable(vma, pvmw->address) &&
 			    (pvmw->nr_pages >= HPAGE_PMD_NR)) {
 				spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
 
-- 
cgit v1.2.3


From 7ce82f4c3f3ead13a9d9498768e3b1a79975c4d8 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 30 May 2022 19:30:15 +0800
Subject: mm/migration: return errno when isolate_huge_page failed

We might fail to isolate huge page due to e.g.  the page is under
migration which cleared HPageMigratable.  We should return errno in this
case rather than always return 1 which could confuse the user, i.e.  the
caller might think all of the memory is migrated while the hugetlb page is
left behind.  We make the prototype of isolate_huge_page consistent with
isolate_lru_page as suggested by Huang Ying and rename isolate_huge_page
to isolate_hugetlb as suggested by Muchun to improve the readability.

Link: https://lkml.kernel.org/r/20220530113016.16663-4-linmiaohe@huawei.com
Fixes: e8db67eb0ded ("mm: migrate: move_pages() supports thp migration")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Suggested-by: Huang Ying <ying.huang@intel.com>
Reported-by: kernel test robot <lkp@intel.com> (build error)
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  6 +++---
 mm/gup.c                |  2 +-
 mm/hugetlb.c            | 11 +++++------
 mm/memory-failure.c     |  2 +-
 mm/memory_hotplug.c     |  2 +-
 mm/mempolicy.c          |  2 +-
 mm/migrate.c            |  7 ++++---
 7 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e4cff27d1198..756b66ff025e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -170,7 +170,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
-bool isolate_huge_page(struct page *page, struct list_head *list);
+int isolate_hugetlb(struct page *page, struct list_head *list);
 int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
 void putback_active_hugepage(struct page *page);
@@ -376,9 +376,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 	return NULL;
 }
 
-static inline bool isolate_huge_page(struct page *page, struct list_head *list)
+static inline int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-	return false;
+	return -EBUSY;
 }
 
 static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
diff --git a/mm/gup.c b/mm/gup.c
index 407a81d5ca03..3129b754ade3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1930,7 +1930,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
 		 * Try to move out any movable page before pinning the range.
 		 */
 		if (folio_test_hugetlb(folio)) {
-			if (!isolate_huge_page(&folio->page,
+			if (isolate_hugetlb(&folio->page,
 						&movable_page_list))
 				isolation_error_count++;
 			continue;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b36a4ef87a2e..dd9a46ccb79c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2766,8 +2766,7 @@ retry:
 		 * Fail with -EBUSY if not possible.
 		 */
 		spin_unlock_irq(&hugetlb_lock);
-		if (!isolate_huge_page(old_page, list))
-			ret = -EBUSY;
+		ret = isolate_hugetlb(old_page, list);
 		spin_lock_irq(&hugetlb_lock);
 		goto free_new;
 	} else if (!HPageFreed(old_page)) {
@@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 	if (hstate_is_gigantic(h))
 		return -ENOMEM;
 
-	if (page_count(head) && isolate_huge_page(head, list))
+	if (page_count(head) && !isolate_hugetlb(head, list))
 		ret = 0;
 	else if (!page_count(head))
 		ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -6960,15 +6959,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
 {
-	bool ret = true;
+	int ret = 0;
 
 	spin_lock_irq(&hugetlb_lock);
 	if (!PageHeadHuge(page) ||
 	    !HPageMigratable(page) ||
 	    !get_page_unless_zero(page)) {
-		ret = false;
+		ret = -EBUSY;
 		goto unlock;
 	}
 	ClearHPageMigratable(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8afca8..845369f839e1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2178,7 +2178,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
 	bool lru = PageLRU(page);
 
 	if (PageHuge(page)) {
-		isolated = isolate_huge_page(page, pagelist);
+		isolated = !isolate_hugetlb(page, pagelist);
 	} else {
 		if (lru)
 			isolated = !isolate_lru_page(page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1f1a730c4499..84990a14d51a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 
 		if (PageHuge(page)) {
 			pfn = page_to_pfn(head) + compound_nr(head) - 1;
-			isolate_huge_page(head, &source);
+			isolate_hugetlb(head, &source);
 			continue;
 		} else if (PageTransHuge(page))
 			pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d39b01fd52fe..9689919a2829 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
-		if (!isolate_huge_page(page, qp->pagelist) &&
+		if (isolate_hugetlb(page, qp->pagelist) &&
 			(flags & MPOL_MF_STRICT))
 			/*
 			 * Failed to isolate page but allow migrating pages
diff --git a/mm/migrate.c b/mm/migrate.c
index c83b3ae2e285..1d036dec1328 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -133,7 +133,7 @@ static void putback_movable_page(struct page *page)
  *
  * This function shall be used whenever the isolated pageset has been
  * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * and isolate_hugetlb().
  */
 void putback_movable_pages(struct list_head *l)
 {
@@ -1628,8 +1628,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 
 	if (PageHuge(page)) {
 		if (PageHead(page)) {
-			isolate_huge_page(page, pagelist);
-			err = 1;
+			err = isolate_hugetlb(page, pagelist);
+			if (!err)
+				err = 1;
 		}
 	} else {
 		struct page *head;
-- 
cgit v1.2.3


From ad1ac596e8a8c4b06715dfbd89853eb73c9886b2 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 30 May 2022 19:30:16 +0800
Subject: mm/migration: fix potential pte_unmap on an not mapped pte

__migration_entry_wait and migration_entry_wait_on_locked assume pte is
always mapped from caller.  But this is not the case when it's called from
migration_entry_wait_huge and follow_huge_pmd.  Add a hugetlbfs variant
that calls hugetlb_migration_entry_wait(ptep == NULL) to fix this issue.

Link: https://lkml.kernel.org/r/20220530113016.16663-5-linmiaohe@huawei.com
Fixes: 30dad30922cc ("mm: migration: add migrate_entry_wait_huge()")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h | 12 ++++++++----
 mm/hugetlb.c            |  4 ++--
 mm/migrate.c            | 23 +++++++++++++++++++----
 3 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index f24775b41880..bb7afd03a324 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl);
 extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					unsigned long address);
-extern void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte);
+#ifdef CONFIG_HUGETLB_PAGE
+extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
+#endif
 #else
 static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
 {
@@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 					spinlock_t *ptl) { }
 static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 					 unsigned long address) { }
-static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte) { }
+#ifdef CONFIG_HUGETLB_PAGE
+static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
+#endif
 static inline int is_writable_migration_entry(swp_entry_t entry)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd9a46ccb79c..ed202d29ca46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5702,7 +5702,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(vma, mm, ptep);
+			migration_entry_wait_huge(vma, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
@@ -6927,7 +6927,7 @@ retry:
 	} else {
 		if (is_hugetlb_entry_migration(pte)) {
 			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
+			__migration_entry_wait_huge((pte_t *)pmd, ptl);
 			goto retry;
 		}
 		/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 1d036dec1328..7934eebf1689 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -315,13 +315,28 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 	__migration_entry_wait(mm, ptep, ptl);
 }
 
-void migration_entry_wait_huge(struct vm_area_struct *vma,
-		struct mm_struct *mm, pte_t *pte)
+#ifdef CONFIG_HUGETLB_PAGE
+void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
 {
-	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
-	__migration_entry_wait(mm, pte, ptl);
+	pte_t pte;
+
+	spin_lock(ptl);
+	pte = huge_ptep_get(ptep);
+
+	if (unlikely(!is_hugetlb_entry_migration(pte)))
+		spin_unlock(ptl);
+	else
+		migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
 }
 
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
+{
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
+
+	__migration_entry_wait_huge(pte, ptl);
+}
+#endif
+
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 {
-- 
cgit v1.2.3


From c9e124e0382d83d458db204f929002ea98daa6a8 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 6 Jun 2022 18:23:06 +0000
Subject: mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h

The function for knowing if given monitoring context's targets will have
pid or not is defined and used in dbgfs only.  However, the logic is also
needed for sysfs.  This commit moves the code to damon.h and makes both
dbgfs and sysfs to use it.

Link: https://lkml.kernel.org/r/20220606182310.48781-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  6 ++++++
 mm/damon/dbgfs.c      | 15 +++++----------
 mm/damon/sysfs.c      |  8 +++-----
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 2765c7d99beb..b9aae19fab3e 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -525,6 +525,12 @@ bool damon_is_registered_ops(enum damon_ops_id id);
 int damon_register_ops(struct damon_operations *ops);
 int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id);
 
+static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
+{
+	return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
+}
+
+
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
 
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index a0dab8b5e45f..5ae810927309 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -275,11 +275,6 @@ out:
 	return ret;
 }
 
-static inline bool target_has_pid(const struct damon_ctx *ctx)
-{
-	return ctx->ops.id == DAMON_OPS_VADDR;
-}
-
 static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
 {
 	struct damon_target *t;
@@ -288,7 +283,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
 	int rc;
 
 	damon_for_each_target(t, ctx) {
-		if (target_has_pid(ctx))
+		if (damon_target_has_pid(ctx))
 			/* Show pid numbers to debugfs users */
 			id = pid_vnr(t->pid);
 		else
@@ -415,7 +410,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
 	struct damon_target *t, *next;
 
 	damon_for_each_target_safe(t, next, ctx) {
-		if (target_has_pid(ctx))
+		if (damon_target_has_pid(ctx))
 			put_pid(t->pid);
 		damon_destroy_target(t);
 	}
@@ -425,11 +420,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
 		if (!t) {
 			damon_for_each_target_safe(t, next, ctx)
 				damon_destroy_target(t);
-			if (target_has_pid(ctx))
+			if (damon_target_has_pid(ctx))
 				dbgfs_put_pids(pids, nr_targets);
 			return -ENOMEM;
 		}
-		if (target_has_pid(ctx))
+		if (damon_target_has_pid(ctx))
 			t->pid = pids[i];
 		damon_add_target(ctx, t);
 	}
@@ -722,7 +717,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx)
 {
 	struct damon_target *t, *next;
 
-	if (!target_has_pid(ctx))
+	if (!damon_target_has_pid(ctx))
 		return;
 
 	mutex_lock(&ctx->kdamond_lock);
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 09f9e8ca3d1f..8810e6abdb06 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2136,8 +2136,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
 	struct damon_target *t, *next;
 
 	damon_for_each_target_safe(t, next, ctx) {
-		if (ctx->ops.id == DAMON_OPS_VADDR ||
-				ctx->ops.id == DAMON_OPS_FVADDR)
+		if (damon_target_has_pid(ctx))
 			put_pid(t->pid);
 		damon_destroy_target(t);
 	}
@@ -2181,8 +2180,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
 
 	if (!t)
 		return -ENOMEM;
-	if (ctx->ops.id == DAMON_OPS_VADDR ||
-			ctx->ops.id == DAMON_OPS_FVADDR) {
+	if (damon_target_has_pid(ctx)) {
 		t->pid = find_get_pid(sys_target->pid);
 		if (!t->pid)
 			goto destroy_targets_out;
@@ -2210,7 +2208,7 @@ static struct damon_target *damon_sysfs_existing_target(
 	struct pid *pid;
 	struct damon_target *t;
 
-	if (ctx->ops.id == DAMON_OPS_PADDR) {
+	if (!damon_target_has_pid(ctx)) {
 		/* Up to only one target for paddr could exist */
 		damon_for_each_target(t, ctx)
 			return t;
-- 
cgit v1.2.3


From d9da8f6cf55eeca642c021912af1890002464c64 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 9 Jun 2022 20:18:46 +0200
Subject: mm: introduce clear_highpage_kasan_tagged

Add a clear_highpage_kasan_tagged() helper that does clear_highpage() on a
page potentially tagged by KASAN.

This helper is used by the following patch.

Link: https://lkml.kernel.org/r/4471979b46b2c487787ddcd08b9dc5fedd1b6ffd.1654798516.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Marco Elver <elver@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 10 ++++++++++
 mm/page_alloc.c         |  8 ++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index fee9835e3793..22379a63e293 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page)
 	kunmap_local(kaddr);
 }
 
+static inline void clear_highpage_kasan_tagged(struct page *page)
+{
+	u8 tag;
+
+	tag = page_kasan_tag(page);
+	page_kasan_tag_reset(page);
+	clear_highpage(page);
+	page_kasan_tag_set(page, tag);
+}
+
 #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
 
 static inline void tag_clear_highpage(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9234863f2488..248469134962 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1302,12 +1302,8 @@ static void kernel_init_pages(struct page *page, int numpages)
 
 	/* s390's use of memset() could override KASAN redzones. */
 	kasan_disable_current();
-	for (i = 0; i < numpages; i++) {
-		u8 tag = page_kasan_tag(page + i);
-		page_kasan_tag_reset(page + i);
-		clear_highpage(page + i);
-		page_kasan_tag_set(page + i, tag);
-	}
+	for (i = 0; i < numpages; i++)
+		clear_highpage_kasan_tagged(page + i);
 	kasan_enable_current();
 }
 
-- 
cgit v1.2.3


From c15187a4a2d660bf490f7873afd0de5288f65c8f Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 31 May 2022 20:22:22 -0700
Subject: mm: memcontrol: introduce mem_cgroup_ino() and
 mem_cgroup_get_from_ino()

Patch series "mm: introduce shrinker debugfs interface", v5.

The only existing debugging mechanism is a couple of tracepoints in
do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end.  They
aren't covering everything though: shrinkers which report 0 objects will
never show up, there is no support for memcg-aware shrinkers.  Shrinkers
are identified by their scan function, which is not always enough (e.g.
hard to guess which super block's shrinker it is having only
"super_cache_scan").

To provide a better visibility and debug options for memory shrinkers this
patchset introduces a /sys/kernel/debug/shrinker interface, to some extent
similar to /sys/kernel/slab.

For each shrinker registered in the system a directory is created.  As
now, the directory will contain only a "scan" file, which allows to get
the number of managed objects for each memory cgroup (for memcg-aware
shrinkers) and each numa node (for numa-aware shrinkers on a numa
machine).  Other interfaces might be added in the future.

To make debugging more pleasant, the patchset also names all shrinkers, so
that debugfs entries can have meaningful names.


This patch (of 5):

Shrinker debugfs requires a way to represent memory cgroups without using
full paths, both for displaying information and getting input from a user.

Cgroup inode number is a perfect way, already used by bpf.

This commit adds a couple of helper functions which will be used to handle
memcg-aware shrinkers.

Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev
Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 mm/memcontrol.c            | 23 +++++++++++++++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 04f2f33607e9..4d31ce55b1c0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -837,6 +837,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 }
 struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
 
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+	return memcg ? cgroup_ino(memcg->css.cgroup) : 0;
+}
+
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
+#endif
+
 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
 {
 	return mem_cgroup_from_css(seq_css(m));
@@ -1343,6 +1352,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return NULL;
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+	return NULL;
+}
+#endif
+
 static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
 {
 	return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 655c09393ad5..1497affe08c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5088,6 +5088,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 	return idr_find(&mem_cgroup_idr, id);
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+
+	cgrp = cgroup_get_from_id(ino);
+	if (!cgrp)
+		return ERR_PTR(-ENOENT);
+
+	css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+	if (css)
+		memcg = container_of(css, struct mem_cgroup, css);
+	else
+		memcg = ERR_PTR(-ENOENT);
+
+	cgroup_put(cgrp);
+
+	return memcg;
+}
+#endif
+
 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
-- 
cgit v1.2.3


From 5035ebc644aec92d55d1bbfe042f35341e4bffb5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 31 May 2022 20:22:23 -0700
Subject: mm: shrinkers: introduce debugfs interface for memory shrinkers

This commit introduces the /sys/kernel/debug/shrinker debugfs interface
which provides an ability to observe the state of individual kernel memory
shrinkers.

Because the feature adds some memory overhead (which shouldn't be large
unless there is a huge amount of registered shrinkers), it's guarded by a
config option (enabled by default).

This commit introduces the "count" interface for each shrinker registered
in the system.

The output is in the following format:
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
<cgroup inode id> <nr of objects on node 0> <nr of objects on node 1>...
...

To reduce the size of output on machines with many thousands cgroups, if
the total number of objects on all nodes is 0, the line is omitted.

If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed as
cgroup inode id.  If the shrinker is not numa-aware, 0's are printed for
all nodes except the first one.

This commit gives debugfs entries simple numeric names, which are not very
convenient.  The following commit in the series will provide shrinkers
with more meaningful names.

[akpm@linux-foundation.org: remove WARN_ON_ONCE(), per Roman]
  Reported-by: syzbot+300d27c79fe6d4cbcc39@syzkaller.appspotmail.com
Link: https://lkml.kernel.org/r/20220601032227.4076670-3-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shrinker.h |  19 +++++-
 lib/Kconfig.debug        |   9 +++
 mm/Makefile              |   1 +
 mm/shrinker_debug.c      | 168 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c              |   6 +-
 5 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100644 mm/shrinker_debug.c

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 76fbf92b04d9..2ced8149c513 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -72,6 +72,10 @@ struct shrinker {
 #ifdef CONFIG_MEMCG
 	/* ID in shrinker_idr */
 	int id;
+#endif
+#ifdef CONFIG_SHRINKER_DEBUG
+	int debugfs_id;
+	struct dentry *debugfs_entry;
 #endif
 	/* objs pending delete, per node */
 	atomic_long_t *nr_deferred;
@@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
-#endif
+
+#ifdef CONFIG_SHRINKER_DEBUG
+extern int shrinker_debugfs_add(struct shrinker *shrinker);
+extern void shrinker_debugfs_remove(struct shrinker *shrinker);
+#else /* CONFIG_SHRINKER_DEBUG */
+static inline int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	return 0;
+}
+static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+}
+#endif /* CONFIG_SHRINKER_DEBUG */
+#endif /* _LINUX_SHRINKER_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2e24db4bff19..0b483a8da409 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -699,6 +699,15 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
 	help
 	  Debug objects boot parameter default value
 
+config SHRINKER_DEBUG
+	default y
+	bool "Enable shrinker debugging support"
+	depends on DEBUG_FS
+	help
+	  Say Y to enable the shrinker debugfs interface which provides
+	  visibility into the kernel memory shrinkers subsystem.
+	  Disable it to avoid an extra memory footprint.
+
 config HAVE_DEBUG_KMEMLEAK
 	bool
 
diff --git a/mm/Makefile b/mm/Makefile
index 6f9ffa968a1a..9a564f836403 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
 obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
new file mode 100644
index 000000000000..1a70556bd46c
--- /dev/null
+++ b/mm/shrinker_debug.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/shrinker.h>
+#include <linux/memcontrol.h>
+
+/* defined in vmscan.c */
+extern struct rw_semaphore shrinker_rwsem;
+extern struct list_head shrinker_list;
+
+static DEFINE_IDA(shrinker_debugfs_ida);
+static struct dentry *shrinker_debugfs_root;
+
+static unsigned long shrinker_count_objects(struct shrinker *shrinker,
+					    struct mem_cgroup *memcg,
+					    unsigned long *count_per_node)
+{
+	unsigned long nr, total = 0;
+	int nid;
+
+	for_each_node(nid) {
+		if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
+			struct shrink_control sc = {
+				.gfp_mask = GFP_KERNEL,
+				.nid = nid,
+				.memcg = memcg,
+			};
+
+			nr = shrinker->count_objects(shrinker, &sc);
+			if (nr == SHRINK_EMPTY)
+				nr = 0;
+		} else {
+			nr = 0;
+		}
+
+		count_per_node[nid] = nr;
+		total += nr;
+	}
+
+	return total;
+}
+
+static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
+{
+	struct shrinker *shrinker = m->private;
+	unsigned long *count_per_node;
+	struct mem_cgroup *memcg;
+	unsigned long total;
+	bool memcg_aware;
+	int ret, nid;
+
+	count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+	if (!count_per_node)
+		return -ENOMEM;
+
+	ret = down_read_killable(&shrinker_rwsem);
+	if (ret) {
+		kfree(count_per_node);
+		return ret;
+	}
+	rcu_read_lock();
+
+	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		if (memcg && !mem_cgroup_online(memcg))
+			continue;
+
+		total = shrinker_count_objects(shrinker,
+					       memcg_aware ? memcg : NULL,
+					       count_per_node);
+		if (total) {
+			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+			for_each_node(nid)
+				seq_printf(m, " %lu", count_per_node[nid]);
+			seq_putc(m, '\n');
+		}
+
+		if (!memcg_aware) {
+			mem_cgroup_iter_break(NULL, memcg);
+			break;
+		}
+
+		if (signal_pending(current)) {
+			mem_cgroup_iter_break(NULL, memcg);
+			ret = -EINTR;
+			break;
+		}
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+	rcu_read_unlock();
+	up_read(&shrinker_rwsem);
+
+	kfree(count_per_node);
+	return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
+
+int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+	struct dentry *entry;
+	char buf[16];
+	int id;
+
+	lockdep_assert_held(&shrinker_rwsem);
+
+	/* debugfs isn't initialized yet, add debugfs entries later. */
+	if (!shrinker_debugfs_root)
+		return 0;
+
+	id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
+	if (id < 0)
+		return id;
+	shrinker->debugfs_id = id;
+
+	snprintf(buf, sizeof(buf), "%d", id);
+
+	/* create debugfs entry */
+	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
+	if (IS_ERR(entry)) {
+		ida_free(&shrinker_debugfs_ida, id);
+		return PTR_ERR(entry);
+	}
+	shrinker->debugfs_entry = entry;
+
+	debugfs_create_file("count", 0220, entry, shrinker,
+			    &shrinker_debugfs_count_fops);
+	return 0;
+}
+
+void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+	lockdep_assert_held(&shrinker_rwsem);
+
+	if (!shrinker->debugfs_entry)
+		return;
+
+	debugfs_remove_recursive(shrinker->debugfs_entry);
+	ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+}
+
+static int __init shrinker_debugfs_init(void)
+{
+	struct shrinker *shrinker;
+	struct dentry *dentry;
+	int ret = 0;
+
+	dentry = debugfs_create_dir("shrinker", NULL);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	shrinker_debugfs_root = dentry;
+
+	/* Create debugfs entries for shrinkers registered at boot */
+	down_write(&shrinker_rwsem);
+	list_for_each_entry(shrinker, &shrinker_list, list)
+		if (!shrinker->debugfs_entry) {
+			ret = shrinker_debugfs_add(shrinker);
+			if (ret)
+				break;
+		}
+	up_write(&shrinker_rwsem);
+
+	return ret;
+}
+late_initcall(shrinker_debugfs_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7d9a683e3a7..35dedff79eb4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task,
 	task->reclaim_state = rs;
 }
 
-static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
@@ -650,6 +650,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	shrinker->flags |= SHRINKER_REGISTERED;
+	shrinker_debugfs_add(shrinker);
 	up_write(&shrinker_rwsem);
 }
 
@@ -677,6 +678,7 @@ void unregister_shrinker(struct shrinker *shrinker)
 	shrinker->flags &= ~SHRINKER_REGISTERED;
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 		unregister_memcg_shrinker(shrinker);
+	shrinker_debugfs_remove(shrinker);
 	up_write(&shrinker_rwsem);
 
 	kfree(shrinker->nr_deferred);
-- 
cgit v1.2.3


From e33c267ab70de4249d22d7eab1cc7d68a889bac2 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 31 May 2022 20:22:24 -0700
Subject: mm: shrinkers: provide shrinkers with names

Currently shrinkers are anonymous objects.  For debugging purposes they
can be identified by count/scan function names, but it's not always
useful: e.g.  for superblock's shrinkers it's nice to have at least an
idea of to which superblock the shrinker belongs.

This commit adds names to shrinkers.  register_shrinker() and
prealloc_shrinker() functions are extended to take a format and arguments
to master a name.

In some cases it's not possible to determine a good name at the time when
a shrinker is allocated.  For such cases shrinker_debugfs_rename() is
provided.

The expected format is:
    <subsystem>-<shrinker_type>[:<instance>]-<id>
For some shrinkers an instance can be encoded as (MAJOR:MINOR) pair.

After this change the shrinker debugfs directory looks like:
  $ cd /sys/kernel/debug/shrinker/
  $ ls
    dquota-cache-16     sb-devpts-28     sb-proc-47       sb-tmpfs-42
    mm-shadow-18        sb-devtmpfs-5    sb-proc-48       sb-tmpfs-43
    mm-zspool:zram0-34  sb-hugetlbfs-17  sb-pstore-31     sb-tmpfs-44
    rcu-kfree-0         sb-hugetlbfs-33  sb-rootfs-2      sb-tmpfs-49
    sb-aio-20           sb-iomem-12      sb-securityfs-6  sb-tracefs-13
    sb-anon_inodefs-15  sb-mqueue-21     sb-selinuxfs-22  sb-xfs:vda1-36
    sb-bdev-3           sb-nsfs-4        sb-sockfs-8      sb-zsmalloc-19
    sb-bpf-32           sb-pipefs-14     sb-sysfs-26      thp-deferred_split-10
    sb-btrfs:vda2-24    sb-proc-25       sb-tmpfs-1       thp-zero-9
    sb-cgroup2-30       sb-proc-39       sb-tmpfs-27      xfs-buf:vda1-37
    sb-configfs-23      sb-proc-41       sb-tmpfs-29      xfs-inodegc:vda1-38
    sb-dax-11           sb-proc-45       sb-tmpfs-35
    sb-debugfs-7        sb-proc-46       sb-tmpfs-40

[roman.gushchin@linux.dev: fix build warnings]
  Link: https://lkml.kernel.org/r/Yr+ZTnLb9lJk6fJO@castle
  Reported-by: kernel test robot <lkp@intel.com>
Link: https://lkml.kernel.org/r/20220601032227.4076670-4-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kvm/mmu/mmu.c                           |  2 +-
 drivers/android/binder_alloc.c                   |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c     |  3 +-
 drivers/gpu/drm/msm/msm_gem_shrinker.c           |  2 +-
 drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c                   |  2 +-
 drivers/md/bcache/btree.c                        |  2 +-
 drivers/md/dm-bufio.c                            |  3 +-
 drivers/md/dm-zoned-metadata.c                   |  4 +-
 drivers/md/raid5.c                               |  2 +-
 drivers/misc/vmw_balloon.c                       |  2 +-
 drivers/virtio/virtio_balloon.c                  |  2 +-
 drivers/xen/xenbus/xenbus_probe_backend.c        |  2 +-
 fs/btrfs/super.c                                 |  2 +
 fs/erofs/utils.c                                 |  2 +-
 fs/ext4/extents_status.c                         |  3 +-
 fs/f2fs/super.c                                  |  2 +-
 fs/gfs2/glock.c                                  |  2 +-
 fs/gfs2/main.c                                   |  2 +-
 fs/jbd2/journal.c                                |  3 +-
 fs/mbcache.c                                     |  2 +-
 fs/nfs/nfs42xattr.c                              |  7 +--
 fs/nfs/super.c                                   |  2 +-
 fs/nfsd/filecache.c                              |  2 +-
 fs/nfsd/nfscache.c                               |  3 +-
 fs/quota/dquot.c                                 |  2 +-
 fs/super.c                                       |  6 ++-
 fs/ubifs/super.c                                 |  2 +-
 fs/xfs/xfs_buf.c                                 |  3 +-
 fs/xfs/xfs_icache.c                              |  2 +-
 fs/xfs/xfs_qm.c                                  |  3 +-
 include/linux/shrinker.h                         | 14 +++++-
 kernel/rcu/tree.c                                |  2 +-
 mm/huge_memory.c                                 |  4 +-
 mm/shrinker_debug.c                              | 47 ++++++++++++++++++-
 mm/vmscan.c                                      | 58 ++++++++++++++++++++++--
 mm/workingset.c                                  |  2 +-
 mm/zsmalloc.c                                    |  3 +-
 net/sunrpc/auth.c                                |  2 +-
 39 files changed, 167 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 17252f39bd7c..797d3286ecc1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6317,7 +6317,7 @@ int kvm_mmu_vendor_module_init(void)
 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
 		goto out;
 
-	ret = register_shrinker(&mmu_shrinker);
+	ret = register_shrinker(&mmu_shrinker, "x86-mmu");
 	if (ret)
 		goto out;
 
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 5649a0371a1f..51b502217d00 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1084,7 +1084,7 @@ int binder_alloc_shrinker_init(void)
 	int ret = list_lru_init(&binder_alloc_lru);
 
 	if (ret == 0) {
-		ret = register_shrinker(&binder_shrinker);
+		ret = register_shrinker(&binder_shrinker, "android-binder");
 		if (ret)
 			list_lru_destroy(&binder_alloc_lru);
 	}
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index 6a6ff98a8746..e43577e03067 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -426,7 +426,8 @@ void i915_gem_driver_register__shrinker(struct drm_i915_private *i915)
 	i915->mm.shrinker.count_objects = i915_gem_shrinker_count;
 	i915->mm.shrinker.seeks = DEFAULT_SEEKS;
 	i915->mm.shrinker.batch = 4096;
-	drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker));
+	drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker,
+						  "drm-i915_gem"));
 
 	i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom;
 	drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier));
diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
index 086dacf2f26a..26e84d2ea6ae 100644
--- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
+++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
@@ -221,7 +221,7 @@ void msm_gem_shrinker_init(struct drm_device *dev)
 	priv->shrinker.count_objects = msm_gem_shrinker_count;
 	priv->shrinker.scan_objects = msm_gem_shrinker_scan;
 	priv->shrinker.seeks = DEFAULT_SEEKS;
-	WARN_ON(register_shrinker(&priv->shrinker));
+	WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem"));
 
 	priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap;
 	WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier));
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
index 77e7cb6d1ae3..bf0170782f25 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
@@ -103,7 +103,7 @@ void panfrost_gem_shrinker_init(struct drm_device *dev)
 	pfdev->shrinker.count_objects = panfrost_gem_shrinker_count;
 	pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan;
 	pfdev->shrinker.seeks = DEFAULT_SEEKS;
-	WARN_ON(register_shrinker(&pfdev->shrinker));
+	WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost"));
 }
 
 /**
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 1bba0a0ed3f9..21b61631f73a 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -722,7 +722,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	mm_shrinker.count_objects = ttm_pool_shrinker_count;
 	mm_shrinker.scan_objects = ttm_pool_shrinker_scan;
 	mm_shrinker.seeks = 1;
-	return register_shrinker(&mm_shrinker);
+	return register_shrinker(&mm_shrinker, "drm-ttm_pool");
 }
 
 /**
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index e136d6edc1ed..147c493a989a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c)
 	c->shrink.seeks = 4;
 	c->shrink.batch = c->btree_pages * 2;
 
-	if (register_shrinker(&c->shrink))
+	if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid))
 		pr_warn("bcache: %s: could not register shrinker\n",
 				__func__);
 
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5ffa1dcf84cf..3ff571b20f14 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1806,7 +1806,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	c->shrinker.scan_objects = dm_bufio_shrink_scan;
 	c->shrinker.seeks = 1;
 	c->shrinker.batch = 0;
-	r = register_shrinker(&c->shrinker);
+	r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name,
+			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
 	if (r)
 		goto bad;
 
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d1ea66114d14..46648f6100fb 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -2944,7 +2944,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
 	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
 
 	/* Metadata cache shrinker */
-	ret = register_shrinker(&zmd->mblk_shrinker);
+	ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)",
+				MAJOR(dev->bdev->bd_dev),
+				MINOR(dev->bdev->bd_dev));
 	if (ret) {
 		dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
 		goto err;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d09256d7f81..780ae66840b7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7414,7 +7414,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->shrinker.count_objects = raid5_cache_count;
 	conf->shrinker.batch = 128;
 	conf->shrinker.flags = 0;
-	ret = register_shrinker(&conf->shrinker);
+	ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev));
 	if (ret) {
 		pr_warn("md/raid:%s: couldn't register shrinker.\n",
 			mdname(mddev));
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 086ce77d9074..c2d2fa114e65 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1587,7 +1587,7 @@ static int vmballoon_register_shrinker(struct vmballoon *b)
 	b->shrinker.count_objects = vmballoon_shrinker_count;
 	b->shrinker.seeks = DEFAULT_SEEKS;
 
-	r = register_shrinker(&b->shrinker);
+	r = register_shrinker(&b->shrinker, "vmw-balloon");
 
 	if (r == 0)
 		b->shrinker_registered = true;
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b9737da6c4dd..cba57b1f382f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -875,7 +875,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
 	vb->shrinker.count_objects = virtio_balloon_shrinker_count;
 	vb->shrinker.seeks = DEFAULT_SEEKS;
 
-	return register_shrinker(&vb->shrinker);
+	return register_shrinker(&vb->shrinker, "virtio-balloon");
 }
 
 static int virtballoon_probe(struct virtio_device *vdev)
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
index 5abded97e1a7..9c09f89d8278 100644
--- a/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -305,7 +305,7 @@ static int __init xenbus_probe_backend_init(void)
 
 	register_xenstore_notifier(&xenstore_notifier);
 
-	if (register_shrinker(&backend_memory_shrinker))
+	if (register_shrinker(&backend_memory_shrinker, "xen-backend"))
 		pr_warn("shrinker registration failed\n");
 
 	return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6627dd7875ee..eee3e96d877f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1815,6 +1815,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 			error = -EBUSY;
 	} else {
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+					s->s_id);
 		btrfs_sb(s)->bdev_holder = fs_type;
 		if (!strstr(crc32c_impl(), "generic"))
 			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ec9a1d780dc1..46627cb69abe 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {
 
 int __init erofs_init_shrinker(void)
 {
-	return register_shrinker(&erofs_shrinker_info);
+	return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
 }
 
 void erofs_exit_shrinker(void)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9a3a8996aacf..23167efda95e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
 	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
-	err = register_shrinker(&sbi->s_es_shrinker);
+	err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
+				sbi->s_sb->s_id);
 	if (err)
 		goto err4;
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 37221e94e5ef..bce02306f7a0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -4579,7 +4579,7 @@ static int __init init_f2fs_fs(void)
 	err = f2fs_init_sysfs();
 	if (err)
 		goto free_garbage_collection_cache;
-	err = register_shrinker(&f2fs_shrinker_info);
+	err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
 	if (err)
 		goto free_sysfs;
 	err = register_filesystem(&f2fs_fs_type);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c992d53013d3..dca842379cab 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2533,7 +2533,7 @@ int __init gfs2_glock_init(void)
 		return -ENOMEM;
 	}
 
-	ret = register_shrinker(&glock_shrinker);
+	ret = register_shrinker(&glock_shrinker, "gfs2-glock");
 	if (ret) {
 		destroy_workqueue(gfs2_delete_workqueue);
 		destroy_workqueue(glock_workqueue);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 244187e3e70f..b66a3e1ec152 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -148,7 +148,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_trans_cachep)
 		goto fail_cachep8;
 
-	error = register_shrinker(&gfs2_qd_shrinker);
+	error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
 	if (error)
 		goto fail_shrinker;
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0cbeeaec2d1..45e4655c8033 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1418,7 +1418,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
 		goto err_cleanup;
 
-	if (register_shrinker(&journal->j_shrinker)) {
+	if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+			      MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
 		percpu_counter_destroy(&journal->j_checkpoint_jh_count);
 		goto err_cleanup;
 	}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..0b833da0a9a5 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -367,7 +367,7 @@ struct mb_cache *mb_cache_create(int bucket_bits)
 	cache->c_shrink.count_objects = mb_cache_count;
 	cache->c_shrink.scan_objects = mb_cache_scan;
 	cache->c_shrink.seeks = DEFAULT_SEEKS;
-	if (register_shrinker(&cache->c_shrink)) {
+	if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
 		kfree(cache->c_hash);
 		kfree(cache);
 		goto err_out;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index e7b34f7e0614..a9bf09fdf2c3 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void)
 	if (ret)
 		goto out2;
 
-	ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+	ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache");
 	if (ret)
 		goto out1;
 
-	ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+	ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry");
 	if (ret)
 		goto out;
 
-	ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+	ret = register_shrinker(&nfs4_xattr_large_entry_shrinker,
+				"nfs-xattr_large_entry");
 	if (!ret)
 		return 0;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6ab5eeb000dc..82944e14fcea 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -149,7 +149,7 @@ int __init register_nfs_fs(void)
 	ret = nfs_register_sysctl();
 	if (ret < 0)
 		goto error_2;
-	ret = register_shrinker(&acl_shrinker);
+	ret = register_shrinker(&acl_shrinker, "nfs-acl");
 	if (ret < 0)
 		goto error_3;
 #ifdef CONFIG_NFS_V4_2
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb2d590c036..a605c0e39b09 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -670,7 +670,7 @@ nfsd_file_cache_init(void)
 		goto out_err;
 	}
 
-	ret = register_shrinker(&nfsd_file_shrinker);
+	ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
 	if (ret) {
 		pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
 		goto out_lru;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 7da88bdc0d6c..9b31e1103e7b 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
 	nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
 	nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
 	nn->nfsd_reply_cache_shrinker.seeks = 1;
-	status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+	status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
+				   "nfsd-reply:%s", nn->nfsd_name);
 	if (status)
 		goto out_stats_destroy;
 
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 09d1307959d0..e0b659900e70 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2995,7 +2995,7 @@ static int __init dquot_init(void)
 	pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
 		" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
 
-	if (register_shrinker(&dqcache_shrinker))
+	if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
 		panic("Cannot register dquot shrinker");
 
 	return 0;
diff --git a/fs/super.c b/fs/super.c
index 60f57c7bc0a6..4fca6657f442 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
 	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
-	if (prealloc_shrinker(&s->s_shrink))
+	if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
 		goto fail;
 	if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
 		goto fail;
@@ -1288,6 +1288,8 @@ int get_tree_bdev(struct fs_context *fc,
 	} else {
 		s->s_mode = mode;
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+					fc->fs_type->name, s->s_id);
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, fc);
 		if (error) {
@@ -1363,6 +1365,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	} else {
 		s->s_mode = mode;
 		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+					fs_type->name, s->s_id);
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
 		if (error) {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0978d01b0ea4..d0c9a09988bc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2430,7 +2430,7 @@ static int __init ubifs_init(void)
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
-	err = register_shrinker(&ubifs_shrinker_info);
+	err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
 	if (err)
 		goto out_slab;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bf4e60871068..4aa9c9cf5b6e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1986,7 +1986,8 @@ xfs_alloc_buftarg(
 	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
 	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
-	if (register_shrinker(&btp->bt_shrinker))
+	if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
+			      mp->m_super->s_id))
 		goto error_pcpu;
 	return btp;
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 5269354b1b69..a1941c8b8630 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -2201,5 +2201,5 @@ xfs_inodegc_register_shrinker(
 	shrink->flags = SHRINKER_NONSLAB;
 	shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
 
-	return register_shrinker(shrink);
+	return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
 }
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index abf08bbf34a9..c31d57453ceb 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -677,7 +677,8 @@ xfs_qm_init_quotainfo(
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
 	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
 
-	error = register_shrinker(&qinf->qi_shrinker);
+	error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
+				  mp->m_super->s_id);
 	if (error)
 		goto out_free_inos;
 
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 2ced8149c513..08e6054e061f 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -75,6 +75,7 @@ struct shrinker {
 #endif
 #ifdef CONFIG_SHRINKER_DEBUG
 	int debugfs_id;
+	const char *name;
 	struct dentry *debugfs_entry;
 #endif
 	/* objs pending delete, per node */
@@ -92,9 +93,11 @@ struct shrinker {
  */
 #define SHRINKER_NONSLAB	(1 << 3)
 
-extern int prealloc_shrinker(struct shrinker *shrinker);
+extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
+					    const char *fmt, ...);
 extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int register_shrinker(struct shrinker *shrinker);
+extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
+					    const char *fmt, ...);
 extern void unregister_shrinker(struct shrinker *shrinker);
 extern void free_prealloced_shrinker(struct shrinker *shrinker);
 extern void synchronize_shrinkers(void);
@@ -102,6 +105,8 @@ extern void synchronize_shrinkers(void);
 #ifdef CONFIG_SHRINKER_DEBUG
 extern int shrinker_debugfs_add(struct shrinker *shrinker);
 extern void shrinker_debugfs_remove(struct shrinker *shrinker);
+extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
+						  const char *fmt, ...);
 #else /* CONFIG_SHRINKER_DEBUG */
 static inline int shrinker_debugfs_add(struct shrinker *shrinker)
 {
@@ -110,5 +115,10 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker)
 static inline void shrinker_debugfs_remove(struct shrinker *shrinker)
 {
 }
+static inline __printf(2, 3)
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return 0;
+}
 #endif /* CONFIG_SHRINKER_DEBUG */
 #endif /* _LINUX_SHRINKER_H */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c25ba442044a..4b3bf6ebb1eb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4884,7 +4884,7 @@ static void __init kfree_rcu_batch_init(void)
 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
-	if (register_shrinker(&kfree_rcu_shrinker))
+	if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree"))
 		pr_err("Failed to register kfree_rcu() shrinker!\n");
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9b90a8d7dfa..60d742c33de3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -423,10 +423,10 @@ static int __init hugepage_init(void)
 	if (err)
 		goto err_slab;
 
-	err = register_shrinker(&huge_zero_page_shrinker);
+	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
 	if (err)
 		goto err_hzp_shrinker;
-	err = register_shrinker(&deferred_split_shrinker);
+	err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
 	if (err)
 		goto err_split_shrinker;
 
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 1a70556bd46c..781ecbd3d608 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -102,7 +102,7 @@ DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
 int shrinker_debugfs_add(struct shrinker *shrinker)
 {
 	struct dentry *entry;
-	char buf[16];
+	char buf[128];
 	int id;
 
 	lockdep_assert_held(&shrinker_rwsem);
@@ -116,7 +116,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
 		return id;
 	shrinker->debugfs_id = id;
 
-	snprintf(buf, sizeof(buf), "%d", id);
+	snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id);
 
 	/* create debugfs entry */
 	entry = debugfs_create_dir(buf, shrinker_debugfs_root);
@@ -131,10 +131,53 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
 	return 0;
 }
 
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+	struct dentry *entry;
+	char buf[128];
+	const char *new, *old;
+	va_list ap;
+	int ret = 0;
+
+	va_start(ap, fmt);
+	new = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+
+	if (!new)
+		return -ENOMEM;
+
+	down_write(&shrinker_rwsem);
+
+	old = shrinker->name;
+	shrinker->name = new;
+
+	if (shrinker->debugfs_entry) {
+		snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
+			 shrinker->debugfs_id);
+
+		entry = debugfs_rename(shrinker_debugfs_root,
+				       shrinker->debugfs_entry,
+				       shrinker_debugfs_root, buf);
+		if (IS_ERR(entry))
+			ret = PTR_ERR(entry);
+		else
+			shrinker->debugfs_entry = entry;
+	}
+
+	up_write(&shrinker_rwsem);
+
+	kfree_const(old);
+
+	return ret;
+}
+EXPORT_SYMBOL(shrinker_debugfs_rename);
+
 void shrinker_debugfs_remove(struct shrinker *shrinker)
 {
 	lockdep_assert_held(&shrinker_rwsem);
 
+	kfree_const(shrinker->name);
+
 	if (!shrinker->debugfs_entry)
 		return;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 35dedff79eb4..97ac6c6c026d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -608,7 +608,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
 /*
  * Add a shrinker callback to be called from the vm.
  */
-int prealloc_shrinker(struct shrinker *shrinker)
+static int __prealloc_shrinker(struct shrinker *shrinker)
 {
 	unsigned int size;
 	int err;
@@ -632,8 +632,36 @@ int prealloc_shrinker(struct shrinker *shrinker)
 	return 0;
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+	if (!shrinker->name)
+		return -ENOMEM;
+
+	err = __prealloc_shrinker(shrinker);
+	if (err)
+		kfree_const(shrinker->name);
+
+	return err;
+}
+#else
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return __prealloc_shrinker(shrinker);
+}
+#endif
+
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
+#ifdef CONFIG_SHRINKER_DEBUG
+	kfree_const(shrinker->name);
+#endif
 	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
 		down_write(&shrinker_rwsem);
 		unregister_memcg_shrinker(shrinker);
@@ -654,15 +682,39 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 	up_write(&shrinker_rwsem);
 }
 
-int register_shrinker(struct shrinker *shrinker)
+static int __register_shrinker(struct shrinker *shrinker)
 {
-	int err = prealloc_shrinker(shrinker);
+	int err = __prealloc_shrinker(shrinker);
 
 	if (err)
 		return err;
 	register_shrinker_prepared(shrinker);
 	return 0;
 }
+
+#ifdef CONFIG_SHRINKER_DEBUG
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+	if (!shrinker->name)
+		return -ENOMEM;
+
+	err = __register_shrinker(shrinker);
+	if (err)
+		kfree_const(shrinker->name);
+	return err;
+}
+#else
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return __register_shrinker(shrinker);
+}
+#endif
 EXPORT_SYMBOL(register_shrinker);
 
 /*
diff --git a/mm/workingset.c b/mm/workingset.c
index 592569a8974c..a5e84862fc86 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -625,7 +625,7 @@ static int __init workingset_init(void)
 	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
 	       timestamp_bits, max_order, bucket_order);
 
-	ret = prealloc_shrinker(&workingset_shadow_shrinker);
+	ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
 	if (ret)
 		goto err;
 	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5d5fc04385b8..f24b71568e83 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2217,7 +2217,8 @@ static int zs_register_shrinker(struct zs_pool *pool)
 	pool->shrinker.batch = 0;
 	pool->shrinker.seeks = DEFAULT_SEEKS;
 
-	return register_shrinker(&pool->shrinker);
+	return register_shrinker(&pool->shrinker, "mm-zspool:%s",
+				 pool->name);
 }
 
 /**
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 682fcd24bf43..04e7b55fe0d9 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -874,7 +874,7 @@ int __init rpcauth_init_module(void)
 	err = rpc_init_authunix();
 	if (err < 0)
 		goto out1;
-	err = register_shrinker(&rpc_cred_shrinker);
+	err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred");
 	if (err < 0)
 		goto out2;
 	return 0;
-- 
cgit v1.2.3


From 8cdcc532268df0893d9756f537cbce479f4c4831 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 13 Jun 2022 19:22:56 +0000
Subject: mm/damon/schemes: add 'LRU_PRIO' DAMOS action

This commit adds a new DAMOS action called 'LRU_PRIO' for the physical
address space.  The action prioritizes pages in the memory regions of the
user-specified target access pattern on their LRU lists.  This is hence
supposed to be used for frequently accessed (hot) memory regions so that
hot pages could be more likely protected under memory pressure.
Internally, it simply calls 'mark_page_accessed()'.

Link: https://lkml.kernel.org/r/20220613192301.8817-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  2 ++
 mm/damon/ops-common.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/damon/ops-common.h |  2 ++
 mm/damon/paddr.c      | 20 ++++++++++++++++++++
 mm/damon/sysfs.c      |  1 +
 5 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index b9aae19fab3e..4c64e03e94d8 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -86,6 +86,7 @@ struct damon_target {
  * @DAMOS_PAGEOUT:	Call ``madvise()`` for the region with MADV_PAGEOUT.
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  */
@@ -95,6 +96,7 @@ enum damos_action {
 	DAMOS_PAGEOUT,
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
+	DAMOS_LRU_PRIO,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 10ef20b2003f..b1335de200e7 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
 	/* Return coldness of the region */
 	return DAMOS_MAX_SCORE - hotness;
 }
+
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+			struct damos *s)
+{
+	unsigned int max_nr_accesses;
+	int freq_subscore;
+	unsigned int age_in_sec;
+	int age_in_log, age_subscore;
+	unsigned int freq_weight = s->quota.weight_nr_accesses;
+	unsigned int age_weight = s->quota.weight_age;
+	int hotness;
+
+	max_nr_accesses = c->aggr_interval / c->sample_interval;
+	freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+	age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+	for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+			age_in_log++, age_in_sec >>= 1)
+		;
+
+	/* If frequency is 0, higher age means it's colder */
+	if (freq_subscore == 0)
+		age_in_log *= -1;
+
+	/*
+	 * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+	 * Scale it to be in [0, 100] and set it as age subscore.
+	 */
+	age_in_log += DAMON_MAX_AGE_IN_LOG;
+	age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+		DAMON_MAX_AGE_IN_LOG / 2;
+
+	hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+	if (freq_weight + age_weight)
+		hotness /= freq_weight + age_weight;
+	/*
+	 * Transform it to fit in [0, DAMOS_MAX_SCORE]
+	 */
+	hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+	return hotness;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index e790cb5f8fe0..52329ff361cd 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
 
 int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
 			struct damos *s);
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+			struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 7bcd48066b43..f145b1d51e13 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -233,6 +233,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r)
 	return applied * PAGE_SIZE;
 }
 
+static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+{
+	unsigned long addr, applied = 0;
+
+	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+		struct page *page = damon_get_page(PHYS_PFN(addr));
+
+		if (!page)
+			continue;
+		mark_page_accessed(page);
+		put_page(page);
+		applied++;
+	}
+	return applied * PAGE_SIZE;
+}
+
 static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme)
@@ -240,6 +256,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 	switch (scheme->action) {
 	case DAMOS_PAGEOUT:
 		return damon_pa_pageout(r);
+	case DAMOS_LRU_PRIO:
+		return damon_pa_mark_accessed(r);
 	default:
 		break;
 	}
@@ -253,6 +271,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
 	switch (scheme->action) {
 	case DAMOS_PAGEOUT:
 		return damon_pageout_score(context, r, scheme);
+	case DAMOS_LRU_PRIO:
+		return damon_hot_score(context, r, scheme);
 	default:
 		break;
 	}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index c35809c6087c..86c69f980927 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -762,6 +762,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
 	"pageout",
 	"hugepage",
 	"nohugepage",
+	"lru_prio",
 	"stat",
 };
 
-- 
cgit v1.2.3


From 99cdc2cd180a7adc87badc9ca92f8af803d8bf3b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Mon, 13 Jun 2022 19:22:58 +0000
Subject: mm/damon/schemes: add 'LRU_DEPRIO' action

This commit adds a new DAMON-based operation scheme action called
'LRU_DEPRIO' for physical address space.  The action deprioritizes pages
in the memory area of the target access pattern on their LRU lists.  This
is hence supposed to be used for rarely accessed (cold) memory regions so
that cold pages could be more likely reclaimed first under memory
pressure.  Internally, it simply calls 'lru_deactivate()'.

Using this with 'LRU_PRIO' action for hot pages, users can proactively
sort LRU lists based on the access pattern.  That is, it can make the LRU
lists somewhat more trustworthy source of access temperature.  As a
result, efficiency of LRU-lists based mechanisms including the reclamation
target selection could be improved.

Link: https://lkml.kernel.org/r/20220613192301.8817-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  2 ++
 mm/damon/paddr.c      | 20 ++++++++++++++++++++
 mm/damon/sysfs.c      |  1 +
 3 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 4c64e03e94d8..7b1f4a488230 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -87,6 +87,7 @@ struct damon_target {
  * @DAMOS_HUGEPAGE:	Call ``madvise()`` for the region with MADV_HUGEPAGE.
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
+ * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  */
@@ -97,6 +98,7 @@ enum damos_action {
 	DAMOS_HUGEPAGE,
 	DAMOS_NOHUGEPAGE,
 	DAMOS_LRU_PRIO,
+	DAMOS_LRU_DEPRIO,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index f145b1d51e13..dc131c6a5403 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -249,6 +249,22 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r)
 	return applied * PAGE_SIZE;
 }
 
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+{
+	unsigned long addr, applied = 0;
+
+	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+		struct page *page = damon_get_page(PHYS_PFN(addr));
+
+		if (!page)
+			continue;
+		deactivate_page(page);
+		put_page(page);
+		applied++;
+	}
+	return applied * PAGE_SIZE;
+}
+
 static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		struct damon_target *t, struct damon_region *r,
 		struct damos *scheme)
@@ -258,6 +274,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		return damon_pa_pageout(r);
 	case DAMOS_LRU_PRIO:
 		return damon_pa_mark_accessed(r);
+	case DAMOS_LRU_DEPRIO:
+		return damon_pa_deactivate_pages(r);
 	default:
 		break;
 	}
@@ -273,6 +291,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
 		return damon_pageout_score(context, r, scheme);
 	case DAMOS_LRU_PRIO:
 		return damon_hot_score(context, r, scheme);
+	case DAMOS_LRU_DEPRIO:
+		return damon_pageout_score(context, r, scheme);
 	default:
 		break;
 	}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 86c69f980927..7488e27c87c3 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -763,6 +763,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
 	"hugepage",
 	"nohugepage",
 	"lru_prio",
+	"lru_deprio",
 	"stat",
 };
 
-- 
cgit v1.2.3


From 64fe24a3e05e5f3ac56fcd45afd2fd1d9cc8fcb6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 14 Jun 2022 11:36:29 +0200
Subject: mm/mprotect: try avoiding write faults for exclusive anonymous pages
 when changing protection

Similar to our MM_CP_DIRTY_ACCT handling for shared, writable mappings, we
can try mapping anonymous pages in a private writable mapping writable if
they are exclusive, the PTE is already dirty, and no special handling
applies.  Mapping the anonymous page writable is essentially the same
thing the write fault handler would do in this case.

Special handling is required for uffd-wp and softdirty tracking, so take
care of that properly.  Also, leave PROT_NONE handling alone for now; in
the future, we could similarly extend the logic in do_numa_page() or use
pte_mk_savedwrite() here.

While this improves mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE)
performance, it should also be a valuable optimization for uffd-wp, when
un-protecting.

This has been previously suggested by Peter Collingbourne in [1], relevant
in the context of the Scudo memory allocator, before we had
PageAnonExclusive.

This commit doesn't add the same handling for PMDs (i.e., anonymous THP,
anonymous hugetlb); benchmark results from Andrea indicate that there are
minor performance gains, so it's might still be valuable to streamline
that logic for all anonymous pages in the future.

As we now also set MM_CP_DIRTY_ACCT for private mappings, let's rename it
to MM_CP_TRY_CHANGE_WRITABLE, to make it clearer what's actually
happening.

Micro-benchmark courtesy of Andrea:

===
 #define _GNU_SOURCE
 #include <sys/mman.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <unistd.h>

 #define SIZE (1024*1024*1024)

int main(int argc, char *argv[])
{
	char *p;
	if (posix_memalign((void **)&p, sysconf(_SC_PAGESIZE)*512, SIZE))
		perror("posix_memalign"), exit(1);
	if (madvise(p, SIZE, argc > 1 ? MADV_HUGEPAGE : MADV_NOHUGEPAGE))
		perror("madvise");
	explicit_bzero(p, SIZE);
	for (int loops = 0; loops < 40; loops++) {
		if (mprotect(p, SIZE, PROT_READ))
			perror("mprotect"), exit(1);
		if (mprotect(p, SIZE, PROT_READ|PROT_WRITE))
			perror("mprotect"), exit(1);
		explicit_bzero(p, SIZE);
	}
}
===

Results on my Ryzen 9 3900X:

Stock 10 runs (lower is better):   AVG 6.398s, STDEV 0.043
Patched 10 runs (lower is better): AVG 3.780s, STDEV 0.026

===

[1] https://lkml.kernel.org/r/20210429214801.2583336-1-pcc@google.com

Link: https://lkml.kernel.org/r/20220614093629.76309-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Suggested-by: Peter Collingbourne <pcc@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  8 ++++--
 mm/mprotect.c      | 77 +++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..09ea26056e2f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1962,8 +1962,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
  * for now all the callers are only use one of the flags at the same
  * time.
  */
-/* Whether we should allow dirty bit accounting */
-#define  MM_CP_DIRTY_ACCT                  (1UL << 0)
+/*
+ * Whether we should manually check if we can map individual PTEs writable,
+ * because something (e.g., COW, uffd-wp) blocks that from happening for all
+ * PTEs automatically in a writable mapping.
+ */
+#define  MM_CP_TRY_CHANGE_WRITABLE	   (1UL << 0)
 /* Whether this protection change is for NUMA hints */
 #define  MM_CP_PROT_NUMA                   (1UL << 1)
 /* Whether this change is for write protecting */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ba5592655ee3..996a97e213ad 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -38,6 +38,39 @@
 
 #include "internal.h"
 
+static inline bool can_change_pte_writable(struct vm_area_struct *vma,
+					   unsigned long addr, pte_t pte)
+{
+	struct page *page;
+
+	VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));
+
+	if (pte_protnone(pte) || !pte_dirty(pte))
+		return false;
+
+	/* Do we need write faults for softdirty tracking? */
+	if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte))
+		return false;
+
+	/* Do we need write faults for uffd-wp tracking? */
+	if (userfaultfd_pte_wp(vma, pte))
+		return false;
+
+	if (!(vma->vm_flags & VM_SHARED)) {
+		/*
+		 * We can only special-case on exclusive anonymous pages,
+		 * because we know that our write-fault handler similarly would
+		 * map them writable without any additional checks while holding
+		 * the PT lock.
+		 */
+		page = vm_normal_page(vma, addr, pte);
+		if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+			return false;
+	}
+
+	return true;
+}
+
 static unsigned long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	int target_node = NUMA_NO_NODE;
-	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 				ptent = pte_wrprotect(ptent);
 				ptent = pte_mkuffd_wp(ptent);
 			} else if (uffd_wp_resolve) {
-				/*
-				 * Leave the write bit to be handled
-				 * by PF interrupt handler, then
-				 * things like COW could be properly
-				 * handled.
-				 */
 				ptent = pte_clear_uffd_wp(ptent);
 			}
 
-			/* Avoid taking write faults for known dirty pages */
-			if (dirty_accountable && pte_dirty(ptent) &&
-					(pte_soft_dirty(ptent) ||
-					 !(vma->vm_flags & VM_SOFTDIRTY))) {
+			/*
+			 * In some writable, shared mappings, we might want
+			 * to catch actual write access -- see
+			 * vma_wants_writenotify().
+			 *
+			 * In all writable, private mappings, we have to
+			 * properly handle COW.
+			 *
+			 * In both cases, we can sometimes still change PTEs
+			 * writable and avoid the write-fault handler, for
+			 * example, if a PTE is already dirty and no other
+			 * COW or special handling is required.
+			 */
+			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+			    !pte_write(ptent) &&
+			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent);
-			}
+
 			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
@@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	bool try_change_writable;
 	pgoff_t pgoff;
 	int error;
-	int dirty_accountable = 0;
 
 	if (newflags == oldflags) {
 		*pprev = vma;
@@ -583,11 +621,20 @@ success:
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
-	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+	/*
+	 * We want to check manually if we can change individual PTEs writable
+	 * if we can't do that automatically for all PTEs in a mapping. For
+	 * private mappings, that's always the case when we have write
+	 * permissions as we properly have to handle COW.
+	 */
+	if (vma->vm_flags & VM_SHARED)
+		try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
+	else
+		try_change_writable = !!(vma->vm_flags & VM_WRITE);
 	vma_set_page_prot(vma);
 
 	change_protection(tlb, vma, start, end, vma->vm_page_prot,
-			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+			  try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);
 
 	/*
 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
-- 
cgit v1.2.3


From b8cecb9376b9d3031cf62b476a0db087b6b01072 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 16:42:44 +0100
Subject: mm/vmscan: convert reclaim_clean_pages_from_list() to folios

Patch series "nvert much of vmscan to folios"

vmscan always operates on folios since it puts the pages on the LRU list.
Switching all of these functions from pages to folios saves 1483 bytes of
text from removing all the baggage around calling compound_page() and
similar functions.


This patch (of 5):

This is a straightforward conversion which removes several hidden calls
to compound_head, saving 330 bytes of kernel text.

Link: https://lkml.kernel.org/r/20220617154248.700416-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20220617154248.700416-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  6 ++++++
 mm/vmscan.c                | 22 +++++++++++-----------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e66f7aa3191d..f32aade2a6e0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -670,6 +670,12 @@ static __always_inline bool PageAnon(struct page *page)
 	return folio_test_anon(page_folio(page));
 }
 
+static __always_inline bool __folio_test_movable(const struct folio *folio)
+{
+	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
+			PAGE_MAPPING_MOVABLE;
+}
+
 static __always_inline int __PageMovable(struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 97ac6c6c026d..2ecca45672e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2041,7 +2041,7 @@ keep:
 }
 
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
-					    struct list_head *page_list)
+					    struct list_head *folio_list)
 {
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -2049,16 +2049,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	};
 	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
-	struct page *page, *next;
-	LIST_HEAD(clean_pages);
+	struct folio *folio, *next;
+	LIST_HEAD(clean_folios);
 	unsigned int noreclaim_flag;
 
-	list_for_each_entry_safe(page, next, page_list, lru) {
-		if (!PageHuge(page) && page_is_file_lru(page) &&
-		    !PageDirty(page) && !__PageMovable(page) &&
-		    !PageUnevictable(page)) {
-			ClearPageActive(page);
-			list_move(&page->lru, &clean_pages);
+	list_for_each_entry_safe(folio, next, folio_list, lru) {
+		if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
+		    !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
+		    !folio_test_unevictable(folio)) {
+			folio_clear_active(folio);
+			list_move(&folio->lru, &clean_folios);
 		}
 	}
 
@@ -2069,11 +2069,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 * change in the future.
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
-	nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+	nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
 					&stat, true);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
-	list_splice(&clean_pages, page_list);
+	list_splice(&clean_folios, folio_list);
 	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
 			    -(long)nr_reclaimed);
 	/*
-- 
cgit v1.2.3


From e3c4cebf3f9db8c9150eb1982da7e353d9938bed Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:49:59 +0100
Subject: mm: add folios_put()

Patch series "Convert the swap code to be more folio-based".

There's still more to do with the swap code, but this reaps a lot of the
folio benefit.  More than 4kB of kernel text saved (with the UEK7 kernel
config).  I don't know how much that's going to translate into CPU
savings, but some of those compound_head() calls are on every page free,
so it should be noticable.  It might even be noticable just from an
I-cache consumption perspective.


This patch (of 22):

This is just a wrapper around release_pages() for now.  Place the
prototype in mm.h along with folio_put() and folio_put_refs().

Link: https://lkml.kernel.org/r/20220617175020.717127-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20220617175020.717127-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h      | 19 +++++++++++++++++++
 include/linux/pagemap.h |  2 --
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 09ea26056e2f..09670ccb94e7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1220,6 +1220,25 @@ static inline void folio_put_refs(struct folio *folio, int refs)
 		__put_page(&folio->page);
 }
 
+void release_pages(struct page **pages, int nr);
+
+/**
+ * folios_put - Decrement the reference count on an array of folios.
+ * @folios: The folios.
+ * @nr: How many folios there are.
+ *
+ * Like folio_put(), but for an array of folios.  This is more efficient
+ * than writing the loop yourself as it will optimise the locks which
+ * need to be taken if the folios are freed.
+ *
+ * Context: May be called in process or interrupt context, but not in NMI
+ * context.  May be called while holding a spinlock.
+ */
+static inline void folios_put(struct folio **folios, unsigned int nr)
+{
+	release_pages((struct page **)folios, nr);
+}
+
 static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ce96866fbec4..c399a9c5da7d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -345,8 +345,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 #endif
 }
 
-void release_pages(struct page **pages, int nr);
-
 struct address_space *page_mapping(struct page *);
 struct address_space *folio_mapping(struct folio *);
 struct address_space *swapcache_mapping(struct folio *);
-- 
cgit v1.2.3


From 7d80dd096f8f889128f67a2d452e4dadeed71e63 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:50:01 +0100
Subject: mm/swap: make __pagevec_lru_add static

__pagevec_lru_add has no callers outside swap.c, so make it static,
and move it to a more logical position in the file.

Link: https://lkml.kernel.org/r/20220617175020.717127-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagevec.h |   1 -
 mm/swap.c               | 126 ++++++++++++++++++++++++------------------------
 2 files changed, 63 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 67b1246f136b..b0e3540f3a4c 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -26,7 +26,6 @@ struct pagevec {
 };
 
 void __pagevec_release(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup_range(struct pagevec *pvec,
 			      struct address_space *mapping,
 			      pgoff_t *start, pgoff_t end);
diff --git a/mm/swap.c b/mm/swap.c
index 4265bee41bbd..cab77a5c64c7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -228,6 +228,69 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
 
 typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
 
+static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
+{
+	int was_unevictable = folio_test_clear_unevictable(folio);
+	long nr_pages = folio_nr_pages(folio);
+
+	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+
+	folio_set_lru(folio);
+	/*
+	 * Is an smp_mb__after_atomic() still required here, before
+	 * folio_evictable() tests PageMlocked, to rule out the possibility
+	 * of stranding an evictable folio on an unevictable LRU?  I think
+	 * not, because __munlock_page() only clears PageMlocked while the LRU
+	 * lock is held.
+	 *
+	 * (That is not true of __page_cache_release(), and not necessarily
+	 * true of release_pages(): but those only clear PageMlocked after
+	 * put_page_testzero() has excluded any other users of the page.)
+	 */
+	if (folio_evictable(folio)) {
+		if (was_unevictable)
+			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+	} else {
+		folio_clear_active(folio);
+		folio_set_unevictable(folio);
+		/*
+		 * folio->mlock_count = !!folio_test_mlocked(folio)?
+		 * But that leaves __mlock_page() in doubt whether another
+		 * actor has already counted the mlock or not.  Err on the
+		 * safe side, underestimate, let page reclaim fix it, rather
+		 * than leaving a page on the unevictable LRU indefinitely.
+		 */
+		folio->mlock_count = 0;
+		if (!was_unevictable)
+			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
+	}
+
+	lruvec_add_folio(lruvec, folio);
+	trace_mm_lru_insertion(folio);
+}
+
+/*
+ * Add the passed pages to the LRU, then drop the caller's refcount
+ * on them.  Reinitialises the caller's pagevec.
+ */
+static void __pagevec_lru_add(struct pagevec *pvec)
+{
+	int i;
+	struct lruvec *lruvec = NULL;
+	unsigned long flags = 0;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct folio *folio = page_folio(pvec->pages[i]);
+
+		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+		__pagevec_lru_add_fn(folio, lruvec);
+	}
+	if (lruvec)
+		unlock_page_lruvec_irqrestore(lruvec, flags);
+	release_pages(pvec->pages, pvec->nr);
+	pagevec_reinit(pvec);
+}
+
 static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 {
 	int i;
@@ -1036,69 +1099,6 @@ void __pagevec_release(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_release);
 
-static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
-{
-	int was_unevictable = folio_test_clear_unevictable(folio);
-	long nr_pages = folio_nr_pages(folio);
-
-	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-
-	folio_set_lru(folio);
-	/*
-	 * Is an smp_mb__after_atomic() still required here, before
-	 * folio_evictable() tests PageMlocked, to rule out the possibility
-	 * of stranding an evictable folio on an unevictable LRU?  I think
-	 * not, because __munlock_page() only clears PageMlocked while the LRU
-	 * lock is held.
-	 *
-	 * (That is not true of __page_cache_release(), and not necessarily
-	 * true of release_pages(): but those only clear PageMlocked after
-	 * put_page_testzero() has excluded any other users of the page.)
-	 */
-	if (folio_evictable(folio)) {
-		if (was_unevictable)
-			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
-	} else {
-		folio_clear_active(folio);
-		folio_set_unevictable(folio);
-		/*
-		 * folio->mlock_count = !!folio_test_mlocked(folio)?
-		 * But that leaves __mlock_page() in doubt whether another
-		 * actor has already counted the mlock or not.  Err on the
-		 * safe side, underestimate, let page reclaim fix it, rather
-		 * than leaving a page on the unevictable LRU indefinitely.
-		 */
-		folio->mlock_count = 0;
-		if (!was_unevictable)
-			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
-	}
-
-	lruvec_add_folio(lruvec, folio);
-	trace_mm_lru_insertion(folio);
-}
-
-/*
- * Add the passed pages to the LRU, then drop the caller's refcount
- * on them.  Reinitialises the caller's pagevec.
- */
-void __pagevec_lru_add(struct pagevec *pvec)
-{
-	int i;
-	struct lruvec *lruvec = NULL;
-	unsigned long flags = 0;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct folio *folio = page_folio(pvec->pages[i]);
-
-		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
-		__pagevec_lru_add_fn(folio, lruvec);
-	}
-	if (lruvec)
-		unlock_page_lruvec_irqrestore(lruvec, flags);
-	release_pages(pvec->pages, pvec->nr);
-	pagevec_reinit(pvec);
-}
-
 /**
  * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
  * @fbatch: The batch to prune
-- 
cgit v1.2.3


From 8d29c7036f5ff360ea1f51b9fed5d909be7c8094 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:50:13 +0100
Subject: mm/swap: convert __put_page() to __folio_put()

Saves 11 bytes of text by removing a check of PageTail.

Link: https://lkml.kernel.org/r/20220617175020.717127-16-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |  6 +++---
 mm/swap.c            | 14 +++++++-------
 net/core/page_pool.c |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 09670ccb94e7..3fb49aec13fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -855,7 +855,7 @@ static inline struct folio *virt_to_folio(const void *x)
 	return page_folio(page);
 }
 
-void __put_page(struct page *page);
+void __folio_put(struct folio *folio);
 
 void put_pages_list(struct list_head *pages);
 
@@ -1197,7 +1197,7 @@ static inline __must_check bool try_get_page(struct page *page)
 static inline void folio_put(struct folio *folio)
 {
 	if (folio_put_testzero(folio))
-		__put_page(&folio->page);
+		__folio_put(folio);
 }
 
 /**
@@ -1217,7 +1217,7 @@ static inline void folio_put(struct folio *folio)
 static inline void folio_put_refs(struct folio *folio, int refs)
 {
 	if (folio_ref_sub_and_test(folio, refs))
-		__put_page(&folio->page);
+		__folio_put(folio);
 }
 
 void release_pages(struct page **pages, int nr);
diff --git a/mm/swap.c b/mm/swap.c
index a5a91aec83da..d09e9ac53809 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -119,16 +119,16 @@ static void __put_compound_page(struct page *page)
 	destroy_compound_page(page);
 }
 
-void __put_page(struct page *page)
+void __folio_put(struct folio *folio)
 {
-	if (unlikely(is_zone_device_page(page)))
-		free_zone_device_page(page);
-	else if (unlikely(PageCompound(page)))
-		__put_compound_page(page);
+	if (unlikely(folio_is_zone_device(folio)))
+		free_zone_device_page(&folio->page);
+	else if (unlikely(folio_test_large(folio)))
+		__put_compound_page(&folio->page);
 	else
-		__put_single_page(page);
+		__put_single_page(&folio->page);
 }
-EXPORT_SYMBOL(__put_page);
+EXPORT_SYMBOL(__folio_put);
 
 /**
  * put_pages_list() - release a list of pages
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f18e6e771993..db70e94c8df2 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -16,7 +16,7 @@
 #include <linux/dma-direction.h>
 #include <linux/dma-mapping.h>
 #include <linux/page-flags.h>
-#include <linux/mm.h> /* for __put_page() */
+#include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
 
-- 
cgit v1.2.3


From 5375336c8c42a343c3b440b6f1e21c65e7b174b9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 17 Jun 2022 18:50:17 +0100
Subject: mm: convert destroy_compound_page() to destroy_large_folio()

All callers now have a folio, so push the folio->page conversion
down to this function.

[akpm@linux-foundation.org: uninline destroy_large_folio() to fix build issue]
Link: https://lkml.kernel.org/r/20220617175020.717127-20-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 6 +-----
 mm/page_alloc.c    | 8 ++++++++
 mm/swap.c          | 2 +-
 mm/vmscan.c        | 4 ++--
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3fb49aec13fd..9cc02a7e503b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -892,11 +892,7 @@ static inline void set_compound_page_dtor(struct page *page,
 	page[1].compound_dtor = compound_dtor;
 }
 
-static inline void destroy_compound_page(struct page *page)
-{
-	VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
-	compound_page_dtors[page[1].compound_dtor](page);
-}
+void destroy_large_folio(struct folio *folio);
 
 static inline int head_compound_pincount(struct page *head)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 248469134962..52fd92b2c1fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -744,6 +744,14 @@ void prep_compound_page(struct page *page, unsigned int order)
 	prep_compound_head(page, order);
 }
 
+void destroy_large_folio(struct folio *folio)
+{
+	enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+	VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+	compound_page_dtors[dtor](&folio->page);
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
 
diff --git a/mm/swap.c b/mm/swap.c
index 5f6caa651599..1f563d857768 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -115,7 +115,7 @@ static void __folio_put_large(struct folio *folio)
 	 */
 	if (!folio_test_hugetlb(folio))
 		__page_cache_release(folio);
-	destroy_compound_page(&folio->page);
+	destroy_large_folio(folio);
 }
 
 void __folio_put(struct folio *folio)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7d3db64a4e0..e660d7205f47 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1979,7 +1979,7 @@ free_it:
 		 * appear not as the counts should be low
 		 */
 		if (unlikely(folio_test_large(folio)))
-			destroy_compound_page(&folio->page);
+			destroy_large_folio(folio);
 		else
 			list_add(&folio->lru, &free_pages);
 		continue;
@@ -2348,7 +2348,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
 
 			if (unlikely(folio_test_large(folio))) {
 				spin_unlock_irq(&lruvec->lru_lock);
-				destroy_compound_page(&folio->page);
+				destroy_large_folio(folio);
 				spin_lock_irq(&lruvec->lru_lock);
 			} else
 				list_add(&folio->lru, &folios_to_free);
-- 
cgit v1.2.3


From ed7802dd48f7a507213cbb95bb4c6f1fe134eb5d Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 17 Jun 2022 21:56:49 +0800
Subject: mm: memory_hotplug: enumerate all supported section flags

Patch series "make hugetlb_optimize_vmemmap compatible with
memmap_on_memory", v3.

This series makes hugetlb_optimize_vmemmap compatible with
memmap_on_memory.


This patch (of 2):

We are almost running out of section flags, only one bit is available in
the worst case (powerpc with 256k pages).  However, there are still some
free bits (in ->section_mem_map) on other architectures (e.g.  x86_64 has
10 bits available, arm64 has 8 bits available with worst case of 64K
pages).  We have hard coded those numbers in code, it is inconvenient to
use those bits on other architectures except powerpc.  So transfer those
section flags to enumeration to make it easy to add new section flags in
the future.  Also, move SECTION_TAINT_ZONE_DEVICE into the scope of
CONFIG_ZONE_DEVICE to save a bit on non-zone-device case.

[songmuchun@bytedance.com: replace enum with defines per David]
  Link: https://lkml.kernel.org/r/20220620110616.12056-2-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220617135650.74901-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220617135650.74901-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 41 ++++++++++++++++++++++++++++++++---------
 mm/memory_hotplug.c    |  6 ++++++
 mm/sparse.c            |  2 +-
 3 files changed, 39 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aab70355d64f..2b5757752333 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1418,16 +1418,32 @@ extern size_t mem_section_usage_size(void);
  *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
  *      worst combination is powerpc with 256k pages,
  *      which results in PFN_SECTION_SHIFT equal 6.
- * To sum it up, at least 6 bits are available.
+ * To sum it up, at least 6 bits are available on all architectures.
+ * However, we can exceed 6 bits on some other architectures except
+ * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
+ * with the worst case of 64K pages on arm64) if we make sure the
+ * exceeded bit is not applicable to powerpc.
  */
-#define SECTION_MARKED_PRESENT		(1UL<<0)
-#define SECTION_HAS_MEM_MAP		(1UL<<1)
-#define SECTION_IS_ONLINE		(1UL<<2)
-#define SECTION_IS_EARLY		(1UL<<3)
-#define SECTION_TAINT_ZONE_DEVICE	(1UL<<4)
-#define SECTION_MAP_LAST_BIT		(1UL<<5)
-#define SECTION_MAP_MASK		(~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT		6
+enum {
+	SECTION_MARKED_PRESENT_BIT,
+	SECTION_HAS_MEM_MAP_BIT,
+	SECTION_IS_ONLINE_BIT,
+	SECTION_IS_EARLY_BIT,
+#ifdef CONFIG_ZONE_DEVICE
+	SECTION_TAINT_ZONE_DEVICE_BIT,
+#endif
+	SECTION_MAP_LAST_BIT,
+};
+
+#define SECTION_MARKED_PRESENT		BIT(SECTION_MARKED_PRESENT_BIT)
+#define SECTION_HAS_MEM_MAP		BIT(SECTION_HAS_MEM_MAP_BIT)
+#define SECTION_IS_ONLINE		BIT(SECTION_IS_ONLINE_BIT)
+#define SECTION_IS_EARLY		BIT(SECTION_IS_EARLY_BIT)
+#ifdef CONFIG_ZONE_DEVICE
+#define SECTION_TAINT_ZONE_DEVICE	BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
+#endif
+#define SECTION_MAP_MASK		(~(BIT(SECTION_MAP_LAST_BIT) - 1))
+#define SECTION_NID_SHIFT		SECTION_MAP_LAST_BIT
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
@@ -1466,12 +1482,19 @@ static inline int online_section(struct mem_section *section)
 	return (section && (section->section_mem_map & SECTION_IS_ONLINE));
 }
 
+#ifdef CONFIG_ZONE_DEVICE
 static inline int online_device_section(struct mem_section *section)
 {
 	unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
 
 	return section && ((section->section_mem_map & flags) == flags);
 }
+#else
+static inline int online_device_section(struct mem_section *section)
+{
+	return 0;
+}
+#endif
 
 static inline int online_section_nr(unsigned long nr)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 84990a14d51a..a2a6d280054f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -670,12 +670,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
 
 }
 
+#ifdef CONFIG_ZONE_DEVICE
 static void section_taint_zone_device(unsigned long pfn)
 {
 	struct mem_section *ms = __pfn_to_section(pfn);
 
 	ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
 }
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif
 
 /*
  * Associate the pfn range with the given zone, initializing the memmaps
diff --git a/mm/sparse.c b/mm/sparse.c
index cb3bfae64036..e5a8a3a0edd7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
 {
 	unsigned long coded_mem_map =
 		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
-	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
 	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
 	return coded_mem_map;
 }
-- 
cgit v1.2.3


From 66361095129b3b5d065e6c09cf0c085ef4a8c40f Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Fri, 17 Jun 2022 21:56:50 +0800
Subject: mm: memory_hotplug: make hugetlb_optimize_vmemmap compatible with
 memmap_on_memory

For now, the feature of hugetlb_free_vmemmap is not compatible with the
feature of memory_hotplug.memmap_on_memory, and hugetlb_free_vmemmap takes
precedence over memory_hotplug.memmap_on_memory.  However, someone wants
to make memory_hotplug.memmap_on_memory takes precedence over
hugetlb_free_vmemmap since memmap_on_memory makes it more likely to
succeed memory hotplug in close-to-OOM situations.  So the decision of
making hugetlb_free_vmemmap take precedence is not wise and elegant.

The proper approach is to have hugetlb_vmemmap.c do the check whether the
section which the HugeTLB pages belong to can be optimized.  If the
section's vmemmap pages are allocated from the added memory block itself,
hugetlb_free_vmemmap should refuse to optimize the vmemmap, otherwise, do
the optimization.  Then both kernel parameters are compatible.  So this
patch introduces VmemmapSelfHosted to mask any non-optimizable vmemmap
pages.  The hugetlb_vmemmap can use this flag to detect if a vmemmap page
can be optimized.

[songmuchun@bytedance.com: walk vmemmap page tables to avoid false-positive]
  Link: https://lkml.kernel.org/r/20220620110616.12056-3-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20220617135650.74901-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Co-developed-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 22 ++++-----
 Documentation/admin-guide/sysctl/vm.rst         |  5 +-
 include/linux/memory_hotplug.h                  |  9 ----
 include/linux/page-flags.h                      | 11 +++++
 mm/hugetlb_vmemmap.c                            | 66 ++++++++++++++++++++++---
 mm/memory_hotplug.c                             | 27 +++++-----
 6 files changed, 93 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8c0ea6b6c6a9..2cacd4f8deb7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1722,9 +1722,11 @@
 			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
 			the default is on.
 
-			This is not compatible with memory_hotplug.memmap_on_memory.
-			If both parameters are enabled, hugetlb_free_vmemmap takes
-			precedence over memory_hotplug.memmap_on_memory.
+			Note that the vmemmap pages may be allocated from the added
+			memory block itself when memory_hotplug.memmap_on_memory is
+			enabled, those vmemmap pages cannot be optimized even if this
+			feature is enabled.  Other vmemmap pages not allocated from
+			the added memory block itself do not be affected.
 
 	hung_task_panic=
 			[KNL] Should the hung task detector generate panics.
@@ -3068,10 +3070,12 @@
 			[KNL,X86,ARM] Boolean flag to enable this feature.
 			Format: {on | off (default)}
 			When enabled, runtime hotplugged memory will
-			allocate its internal metadata (struct pages)
-			from the hotadded memory which will allow to
-			hotadd a lot of memory without requiring
-			additional memory to do so.
+			allocate its internal metadata (struct pages,
+			those vmemmap pages cannot be optimized even
+			if hugetlb_free_vmemmap is enabled) from the
+			hotadded memory which will allow to hotadd a
+			lot of memory without requiring additional
+			memory to do so.
 			This feature is disabled by default because it
 			has some implication on large (e.g. GB)
 			allocations in some configurations (e.g. small
@@ -3081,10 +3085,6 @@
 			Note that even when enabled, there are a few cases where
 			the feature is not effective.
 
-			This is not compatible with hugetlb_free_vmemmap. If
-			both parameters are enabled, hugetlb_free_vmemmap takes
-			precedence over memory_hotplug.memmap_on_memory.
-
 	memtest=	[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
 			Format: <integer>
 			default : 0 <disable>
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 4a440a7cfeb0..f74f722ad702 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst
 hugetlb_optimize_vmemmap
 ========================
 
-This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
-is configured or the size of 'struct page' (a structure defined in
-include/linux/mm_types.h) is not power of two (an unusual system config could
+This knob is not available when the size of 'struct page' (a structure defined
+in include/linux/mm_types.h) is not power of two (an unusual system config could
 result in this).
 
 Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 20d7edf62a6a..e0b2209ab71c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size);
 extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-bool mhp_memmap_on_memory(void);
-#else
-static inline bool mhp_memmap_on_memory(void)
-{
-	return false;
-}
-#endif
-
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f32aade2a6e0..82719d33c0f1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -193,6 +193,11 @@ enum pageflags {
 
 	/* Only valid for buddy pages. Used to track pages that are reported */
 	PG_reported = PG_uptodate,
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* For self-hosted memmap pages */
+	PG_vmemmap_self_hosted = PG_owner_priv_1,
+#endif
 };
 
 #define PAGEFLAGS_MASK		((1UL << NR_PAGEFLAGS) - 1)
@@ -628,6 +633,12 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
  */
 __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
+#else
+PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba29c15c53d6..1362feb3c6c9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -10,7 +10,7 @@
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
-#include <linux/memory_hotplug.h>
+#include <linux/memory.h>
 #include "hugetlb_vmemmap.h"
 
 /*
@@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 	return ret;
 }
 
+static unsigned int vmemmap_optimizable_pages(struct hstate *h,
+					      struct page *head)
+{
+	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+		return 0;
+
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+		pmd_t *pmdp, pmd;
+		struct page *vmemmap_page;
+		unsigned long vaddr = (unsigned long)head;
+
+		/*
+		 * Only the vmemmap page's vmemmap page can be self-hosted.
+		 * Walking the page tables to find the backing page of the
+		 * vmemmap page.
+		 */
+		pmdp = pmd_off_k(vaddr);
+		/*
+		 * The READ_ONCE() is used to stabilize *pmdp in a register or
+		 * on the stack so that it will stop changing under the code.
+		 * The only concurrent operation where it can be changed is
+		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
+		 * operation).
+		 */
+		pmd = READ_ONCE(*pmdp);
+		if (pmd_leaf(pmd))
+			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
+		else
+			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
+		/*
+		 * Due to HugeTLB alignment requirements and the vmemmap pages
+		 * being at the start of the hotplugged memory region in
+		 * memory_hotplug.memmap_on_memory case. Checking any vmemmap
+		 * page's vmemmap page if it is marked as VmemmapSelfHosted is
+		 * sufficient.
+		 *
+		 * [                  hotplugged memory                  ]
+		 * [        section        ][...][        section        ]
+		 * [ vmemmap ][              usable memory               ]
+		 *   ^   |     |                                        |
+		 *   +---+     |                                        |
+		 *     ^       |                                        |
+		 *     +-------+                                        |
+		 *          ^                                           |
+		 *          +-------------------------------------------+
+		 */
+		if (PageVmemmapSelfHosted(vmemmap_page))
+			return 0;
+	}
+
+	return hugetlb_optimize_vmemmap_pages(h);
+}
+
 void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
 	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
 
-	vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+	vmemmap_pages = vmemmap_optimizable_pages(h, head);
 	if (!vmemmap_pages)
 		return;
 
-	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
-		return;
-
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
 	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
@@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 static __init int hugetlb_vmemmap_sysctls_init(void)
 {
 	/*
-	 * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
-	 * crosses page boundaries, the vmemmap pages cannot be optimized.
+	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
+	 * be optimized.
 	 */
-	if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+	if (is_power_of_2(sizeof(struct page)))
 		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
 
 	return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a2a6d280054f..99ecb2b3ff53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -43,30 +43,22 @@
 #include "shuffle.h"
 
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
-{
-	if (hugetlb_optimize_vmemmap_enabled())
-		return 0;
-	return param_set_bool(val, kp);
-}
-
-static const struct kernel_param_ops memmap_on_memory_ops = {
-	.flags	= KERNEL_PARAM_OPS_FL_NOARG,
-	.set	= memmap_on_memory_set,
-	.get	= param_get_bool,
-};
-
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
 static bool memmap_on_memory __ro_after_init;
-module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
+module_param(memmap_on_memory, bool, 0444);
 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
 
-bool mhp_memmap_on_memory(void)
+static inline bool mhp_memmap_on_memory(void)
 {
 	return memmap_on_memory;
 }
+#else
+static inline bool mhp_memmap_on_memory(void)
+{
+	return false;
+}
 #endif
 
 enum {
@@ -1035,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 			      struct zone *zone)
 {
 	unsigned long end_pfn = pfn + nr_pages;
-	int ret;
+	int ret, i;
 
 	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
 	if (ret)
@@ -1043,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
 
+	for (i = 0; i < nr_pages; i++)
+		SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+
 	/*
 	 * It might be that the vmemmap_pages fully span sections. If that is
 	 * the case, mark those sections online here as otherwise they will be
-- 
cgit v1.2.3


From e8da368a1e42a8056d1a6b419e1b91b6cf11d77e Mon Sep 17 00:00:00 2001
From: Yun-Ze Li <p76091292@gs.ncku.edu.tw>
Date: Mon, 20 Jun 2022 07:15:16 +0000
Subject: mm, docs: fix comments that mention mem_hotplug_end()

Comments that mention mem_hotplug_end() are confusing as there is no
function called mem_hotplug_end().  Fix them by replacing all the
occurences of mem_hotplug_end() in the comments with mem_hotplug_done().

[akpm@linux-foundation.org: grammatical fixes]
Link: https://lkml.kernel.org/r/20220620071516.1286101-1-p76091292@gs.ncku.edu.tw
Signed-off-by: Yun-Ze Li <p76091292@gs.ncku.edu.tw>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 6 +++---
 mm/compaction.c        | 2 +-
 mm/vmscan.c            | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2b5757752333..735bf5b37949 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -591,8 +591,8 @@ struct zone {
 	 * give them a chance of being in the same cacheline.
 	 *
 	 * Write access to present_pages at runtime should be protected by
-	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
-	 * present_pages should get_online_mems() to get a stable value.
+	 * mem_hotplug_begin/done(). Any reader who can't tolerant drift of
+	 * present_pages should use get_online_mems() to get a stable value.
 	 */
 	atomic_long_t		managed_pages;
 	unsigned long		spanned_pages;
@@ -870,7 +870,7 @@ typedef struct pglist_data {
 	unsigned long nr_reclaim_start;	/* nr pages written while throttled
 					 * when throttling started. */
 	struct task_struct *kswapd;	/* Protected by
-					   mem_hotplug_begin/end() */
+					   mem_hotplug_begin/done() */
 	int kswapd_order;
 	enum zone_type kswapd_highest_zoneidx;
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 1f89b969c12b..cd029ab03d0e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -3011,7 +3011,7 @@ void kcompactd_run(int nid)
 
 /*
  * Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
  */
 void kcompactd_stop(int nid)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 161096d9311a..f58761cea0a0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4645,7 +4645,7 @@ void kswapd_run(int nid)
 
 /*
  * Called by memory hotplug when all memory in a node is offlined.  Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
  */
 void kswapd_stop(int nid)
 {
-- 
cgit v1.2.3


From 18f3962953e40401b7ed98e8524167282c3e626e Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Sun, 26 Jun 2022 22:57:17 +0800
Subject: mm: hugetlb: kill set_huge_swap_pte_at()

Commit e5251fd43007 ("mm/hugetlb: introduce set_huge_swap_pte_at()
helper") add set_huge_swap_pte_at() to handle swap entries on
architectures that support hugepages consisting of contiguous ptes.  And
currently the set_huge_swap_pte_at() is only overridden by arm64.

set_huge_swap_pte_at() provide a sz parameter to help determine the number
of entries to be updated.  But in fact, all hugetlb swap entries contain
pfn information, so we can find the corresponding folio through the pfn
recorded in the swap entry, then the folio_size() is the number of entries
that need to be updated.

And considering that users will easily cause bugs by ignoring the
difference between set_huge_swap_pte_at() and set_huge_pte_at().  Let's
handle swap entries in set_huge_pte_at() and remove the
set_huge_swap_pte_at(), then we can call set_huge_pte_at() anywhere, which
simplifies our coding.

Link: https://lkml.kernel.org/r/20220626145717.53572-1-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/hugetlb.h |  3 ---
 arch/arm64/mm/hugetlbpage.c      | 34 +++++++++++++++++-----------------
 include/linux/hugetlb.h          | 13 -------------
 mm/hugetlb.c                     |  8 +++-----
 mm/rmap.c                        | 11 +++--------
 5 files changed, 23 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 1fd2846dbefe..d20f5da2d76f 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -46,9 +46,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep, unsigned long sz);
 #define __HAVE_ARCH_HUGE_PTEP_GET
 extern pte_t huge_ptep_get(pte_t *ptep);
-extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep, pte_t pte, unsigned long sz);
-#define set_huge_swap_pte_at set_huge_swap_pte_at
 
 void __init arm64_hugetlb_cma_reserve(void);
 
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index e2a5ec9fdc0d..3be8f25aa5be 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -238,6 +238,13 @@ static void clear_flush(struct mm_struct *mm,
 	flush_tlb_range(&vma, saddr, addr);
 }
 
+static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
+{
+	VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));
+
+	return page_folio(pfn_to_page(swp_offset(entry)));
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 			    pte_t *ptep, pte_t pte)
 {
@@ -247,11 +254,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	unsigned long pfn, dpfn;
 	pgprot_t hugeprot;
 
-	/*
-	 * Code needs to be expanded to handle huge swap and migration
-	 * entries. Needed for HUGETLB and MEMORY_FAILURE.
-	 */
-	WARN_ON(!pte_present(pte));
+	if (!pte_present(pte)) {
+		struct folio *folio;
+
+		folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte));
+		ncontig = num_contig_ptes(folio_size(folio), &pgsize);
+
+		for (i = 0; i < ncontig; i++, ptep++)
+			set_pte_at(mm, addr, ptep, pte);
+		return;
+	}
 
 	if (!pte_cont(pte)) {
 		set_pte_at(mm, addr, ptep, pte);
@@ -269,18 +281,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
 }
 
-void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-			  pte_t *ptep, pte_t pte, unsigned long sz)
-{
-	int i, ncontig;
-	size_t pgsize;
-
-	ncontig = num_contig_ptes(sz, &pgsize);
-
-	for (i = 0; i < ncontig; i++, ptep++)
-		set_pte(ptep, pte);
-}
-
 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, unsigned long sz)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 756b66ff025e..c6cccfaf8708 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -903,14 +903,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
 	atomic_long_sub(l, &mm->hugetlb_usage);
 }
 
-#ifndef set_huge_swap_pte_at
-static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-					pte_t *ptep, pte_t pte, unsigned long sz)
-{
-	set_huge_pte_at(mm, addr, ptep, pte);
-}
-#endif
-
 #ifndef huge_ptep_modify_prot_start
 #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
 static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
@@ -1094,11 +1086,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
 {
 }
 
-static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
-					pte_t *ptep, pte_t pte, unsigned long sz)
-{
-}
-
 static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 					  unsigned long addr, pte_t *ptep)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65454896f174..064da8ffbac6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4798,12 +4798,11 @@ again:
 				entry = swp_entry_to_pte(swp_entry);
 				if (userfaultfd_wp(src_vma) && uffd_wp)
 					entry = huge_pte_mkuffd_wp(entry);
-				set_huge_swap_pte_at(src, addr, src_pte,
-						     entry, sz);
+				set_huge_pte_at(src, addr, src_pte, entry);
 			}
 			if (!userfaultfd_wp(dst_vma) && uffd_wp)
 				entry = huge_pte_clear_uffd_wp(entry);
-			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+			set_huge_pte_at(dst, addr, dst_pte, entry);
 		} else if (unlikely(is_pte_marker(entry))) {
 			/*
 			 * We copy the pte marker only if the dst vma has
@@ -6344,8 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 					newpte = pte_swp_mkuffd_wp(newpte);
 				else if (uffd_wp_resolve)
 					newpte = pte_swp_clear_uffd_wp(newpte);
-				set_huge_swap_pte_at(mm, address, ptep,
-						     newpte, psize);
+				set_huge_pte_at(mm, address, ptep, newpte);
 				pages++;
 			}
 			spin_unlock(ptl);
diff --git a/mm/rmap.c b/mm/rmap.c
index 56134cdc5ca3..83172ee0ea35 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1618,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (folio_test_hugetlb(folio)) {
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_swap_pte_at(mm, address,
-						     pvmw.pte, pteval,
-						     vma_mmu_pagesize(vma));
+				set_huge_pte_at(mm, address, pvmw.pte, pteval);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2004,9 +2002,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (folio_test_hugetlb(folio)) {
 				hugetlb_count_sub(folio_nr_pages(folio), mm);
-				set_huge_swap_pte_at(mm, address,
-						     pvmw.pte, pteval,
-						     vma_mmu_pagesize(vma));
+				set_huge_pte_at(mm, address, pvmw.pte, pteval);
 			} else {
 				dec_mm_counter(mm, mm_counter(&folio->page));
 				set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2074,8 +2070,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_uffd_wp(pteval))
 				swp_pte = pte_swp_mkuffd_wp(swp_pte);
 			if (folio_test_hugetlb(folio))
-				set_huge_swap_pte_at(mm, address, pvmw.pte,
-						     swp_pte, vma_mmu_pagesize(vma));
+				set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
 			else
 				set_pte_at(mm, address, pvmw.pte, swp_pte);
 			trace_set_migration_pte(address, pte_val(swp_pte),
-- 
cgit v1.2.3


From 1baec203b77cafa24610b5c9ae7a2aa380d74ef6 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Sat, 25 Jun 2022 17:28:16 +0800
Subject: mm/khugepaged: try to free transhuge swapcache when possible

Transhuge swapcaches won't be freed in __collapse_huge_page_copy().  It's
because release_pte_page() is not called for these pages and thus
free_page_and_swap_cache can't grab the page lock.  These pages won't be
freed from swap cache even if we are the only user until next time
reclaim.  It shouldn't hurt indeed, but we could try to free these pages
to save more memory for system.

Link: https://lkml.kernel.org/r/20220625092816.4856-8-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: NeilBrown <neilb@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 5 +++++
 mm/khugepaged.c      | 7 ++++++-
 mm/swap.h            | 5 -----
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 95a5b7aa1ae9..6d11c51b2b62 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -455,6 +455,7 @@ static inline unsigned long total_swapcache_pages(void)
 	return global_node_page_state(NR_SWAPCACHE);
 }
 
+extern void free_swap_cache(struct page *page);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 /* linux/mm/swapfile.c */
@@ -539,6 +540,10 @@ static inline void put_swap_device(struct swap_info_struct *si)
 /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
 #define free_swap_and_cache(e) is_pfn_swap_entry(e)
 
+static inline void free_swap_cache(struct page *page)
+{
+}
+
 static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
 {
 	return 0;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 08e885f28def..01e0d6336754 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -755,7 +755,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
 		list_del(&src_page->lru);
-		release_pte_page(src_page);
+		mod_node_page_state(page_pgdat(src_page),
+				    NR_ISOLATED_ANON + page_is_file_lru(src_page),
+				    -compound_nr(src_page));
+		unlock_page(src_page);
+		free_swap_cache(src_page);
+		putback_lru_page(src_page);
 	}
 }
 
diff --git a/mm/swap.h b/mm/swap.h
index fa0816af4712..17936e068c1c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,7 +41,6 @@ void __delete_from_swap_cache(struct folio *folio,
 void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				  unsigned long end);
-void free_swap_cache(struct page *page);
 struct page *lookup_swap_cache(swp_entry_t entry,
 			       struct vm_area_struct *vma,
 			       unsigned long addr);
@@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
 	return NULL;
 }
 
-static inline void free_swap_cache(struct page *page)
-{
-}
-
 static inline void show_swap_cache_info(void)
 {
 }
-- 
cgit v1.2.3


From 50d6281ce9b8412f7ef02d1bc9d23aa62ae0cf98 Mon Sep 17 00:00:00 2001
From: Ren Zhijie <renzhijie2@huawei.com>
Date: Thu, 30 Jun 2022 20:35:28 +0800
Subject: dma-mapping: Fix build error unused-value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_DMA_DECLARE_COHERENT is not set,
make ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- will be failed, like this:

drivers/remoteproc/remoteproc_core.c: In function ‘rproc_rvdev_release’:
./include/linux/dma-map-ops.h:182:42: error: statement with no effect [-Werror=unused-value]
 #define dma_release_coherent_memory(dev) (0)
                                          ^
drivers/remoteproc/remoteproc_core.c:464:2: note: in expansion of macro ‘dma_release_coherent_memory’
  dma_release_coherent_memory(dev);
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors

The return type of function dma_release_coherent_memory in CONFIG_DMA_DECLARE_COHERENT area is void, so in !CONFIG_DMA_DECLARE_COHERENT area it should neither return any value nor be defined as zero.

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: e61c451476e6 ("dma-mapping: Add dma_release_coherent_memory to DMA API")
Signed-off-by: Ren Zhijie <renzhijie2@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220630123528.251181-1-renzhijie2@huawei.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/dma-map-ops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 53db9655efe9..bfffe494356a 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -179,10 +179,10 @@ static inline int dma_declare_coherent_memory(struct device *dev,
 	return -ENOSYS;
 }
 
-#define dma_release_coherent_memory(dev) (0)
 #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0)
 #define dma_release_from_dev_coherent(dev, order, vaddr) (0)
 #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0)
+static inline void dma_release_coherent_memory(struct device *dev) { }
 #endif /* CONFIG_DMA_DECLARE_COHERENT */
 
 #ifdef CONFIG_DMA_GLOBAL_POOL
-- 
cgit v1.2.3


From 6a4e9307cd3782f8e805fac970b9a240ab3078d6 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Sat, 21 May 2022 13:11:12 +0200
Subject: dmaengine: qcom: fix typo in comment

Spelling mistake (triple letters) in comment.
Detected with the help of Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>

Link: https://lore.kernel.org/r/20220521111145.81697-62-Julia.Lawall@inria.fr
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/qcom-gpi-dma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
index f46dc3372f11..6680dd1a43c6 100644
--- a/include/linux/dma/qcom-gpi-dma.h
+++ b/include/linux/dma/qcom-gpi-dma.h
@@ -26,7 +26,7 @@ enum spi_transfer_cmd {
  * @clk_div: source clock divider
  * @clk_src: serial clock
  * @cmd: spi cmd
- * @fragmentation: keep CS assserted at end of sequence
+ * @fragmentation: keep CS asserted at end of sequence
  * @cs: chip select toggle
  * @set_config: set peripheral config
  * @rx_len: receive length for buffer
-- 
cgit v1.2.3


From 3b7e2482f9a3f2e99628048b945c9c6cc53bd38e Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:36 +0100
Subject: iommu: Introduce a callback to struct iommu_resv_region
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A callback is introduced to struct iommu_resv_region to free memory
allocations associated with the reserved region. This will be useful
when we introduce support for IORT RMR based reserved regions.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-2-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 16 +++++++++++-----
 include/linux/iommu.h |  2 ++
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index cdc86c39954e..f46431ac49e1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2590,16 +2590,22 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list)
  * @list: reserved region list for device
  *
  * IOMMU drivers can use this to implement their .put_resv_regions() callback
- * for simple reservations. Memory allocated for each reserved region will be
- * freed. If an IOMMU driver allocates additional resources per region, it is
- * going to have to implement a custom callback.
+ * for simple reservations. If a per region callback is provided that will be
+ * used to free all memory allocations associated with the reserved region or
+ * else just free up the memory for the regions. If an IOMMU driver allocates
+ * additional resources per region, it is going to have to implement a custom
+ * callback.
  */
 void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
 {
 	struct iommu_resv_region *entry, *next;
 
-	list_for_each_entry_safe(entry, next, list, list)
-		kfree(entry);
+	list_for_each_entry_safe(entry, next, list, list) {
+		if (entry->free)
+			entry->free(dev, entry);
+		else
+			kfree(entry);
+	}
 }
 EXPORT_SYMBOL(generic_iommu_put_resv_regions);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5e1afe169549..b22ffa6bc4a9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -135,6 +135,7 @@ enum iommu_resv_type {
  * @length: Length of the region in bytes
  * @prot: IOMMU Protection flags (READ/WRITE/...)
  * @type: Type of the reserved region
+ * @free: Callback to free associated memory allocations
  */
 struct iommu_resv_region {
 	struct list_head	list;
@@ -142,6 +143,7 @@ struct iommu_resv_region {
 	size_t			length;
 	int			prot;
 	enum iommu_resv_type	type;
+	void (*free)(struct device *dev, struct iommu_resv_region *region);
 };
 
 /**
-- 
cgit v1.2.3


From 8778b1d48117055c710a01498f65fa730160fdfa Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:37 +0100
Subject: ACPI/IORT: Make iort_iommu_msi_get_resv_regions() return void

At present iort_iommu_msi_get_resv_regions() returns the number of
MSI reserved regions on success and there are no users for this.
The reserved region list will get populated anyway for platforms
that require the HW MSI region reservation. Hence, change the
function to return void instead.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-3-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 25 +++++++++----------------
 include/linux/acpi_iort.h |  6 +++---
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index f2f8f05662de..213f61cae176 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -811,22 +811,19 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
  * @dev: Device from iommu_get_resv_regions()
  * @head: Reserved region list from iommu_get_resv_regions()
  *
- * Returns: Number of msi reserved regions on success (0 if platform
- *          doesn't require the reservation or no associated msi regions),
- *          appropriate error value otherwise. The ITS interrupt translation
- *          spaces (ITS_base + SZ_64K, SZ_64K) associated with the device
- *          are the msi reserved regions.
+ * The ITS interrupt translation spaces (ITS_base + SZ_64K, SZ_64K)
+ * associated with the device are the HW MSI reserved regions.
  */
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct acpi_iort_its_group *its;
 	struct acpi_iort_node *iommu_node, *its_node = NULL;
-	int i, resv = 0;
+	int i;
 
 	iommu_node = iort_get_msi_resv_iommu(dev);
 	if (!iommu_node)
-		return 0;
+		return;
 
 	/*
 	 * Current logic to reserve ITS regions relies on HW topologies
@@ -846,7 +843,7 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 	}
 
 	if (!its_node)
-		return 0;
+		return;
 
 	/* Move to ITS specific data */
 	its = (struct acpi_iort_its_group *)its_node->node_data;
@@ -860,14 +857,10 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 
 			region = iommu_alloc_resv_region(base + SZ_64K, SZ_64K,
 							 prot, IOMMU_RESV_MSI);
-			if (region) {
+			if (region)
 				list_add_tail(&region->list, head);
-				resv++;
-			}
 		}
 	}
-
-	return (resv == its->its_count) ? resv : -ENODEV;
 }
 
 static inline bool iort_iommu_driver_enabled(u8 type)
@@ -1034,8 +1027,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 }
 
 #else
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
-{ return 0; }
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{ }
 int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
 { return -ENODEV; }
 #endif
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index f1f0842a2cb2..a8198b83753d 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -36,7 +36,7 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
 #else
 static inline void acpi_iort_init(void) { }
@@ -52,8 +52,8 @@ static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 { return -ENODEV; }
 static inline
-int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
-{ return 0; }
+void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+{ }
 
 static inline phys_addr_t acpi_iort_dma_get_max_cpu_address(void)
 { return PHYS_ADDR_MAX; }
-- 
cgit v1.2.3


From 55be25b8b5e4e2fd680cfb073b84a74a79c002fa Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:38 +0100
Subject: ACPI/IORT: Provide a generic helper to retrieve reserve regions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently IORT provides a helper to retrieve HW MSI reserve regions.
Change this to a generic helper to retrieve any IORT related reserve
regions. This will be useful when we add support for RMR nodes in
subsequent patches.

[Lorenzo: For ACPI IORT]

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-4-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 22 +++++++++++++++-------
 drivers/iommu/dma-iommu.c |  2 +-
 include/linux/acpi_iort.h |  4 ++--
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 213f61cae176..cd5d1d7823cb 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -806,15 +806,13 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
 	return NULL;
 }
 
-/**
- * iort_iommu_msi_get_resv_regions - Reserved region driver helper
- * @dev: Device from iommu_get_resv_regions()
- * @head: Reserved region list from iommu_get_resv_regions()
- *
+/*
+ * Retrieve platform specific HW MSI reserve regions.
  * The ITS interrupt translation spaces (ITS_base + SZ_64K, SZ_64K)
  * associated with the device are the HW MSI reserved regions.
  */
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+static void iort_iommu_msi_get_resv_regions(struct device *dev,
+					    struct list_head *head)
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct acpi_iort_its_group *its;
@@ -863,6 +861,16 @@ void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 	}
 }
 
+/**
+ * iort_iommu_get_resv_regions - Generic helper to retrieve reserved regions.
+ * @dev: Device from iommu_get_resv_regions()
+ * @head: Reserved region list from iommu_get_resv_regions()
+ */
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+	iort_iommu_msi_get_resv_regions(dev, head);
+}
+
 static inline bool iort_iommu_driver_enabled(u8 type)
 {
 	switch (type) {
@@ -1027,7 +1035,7 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 }
 
 #else
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
 { }
 int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
 { return -ENODEV; }
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 1910f4f1612b..458fb6738223 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -387,7 +387,7 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
 
 	if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode))
-		iort_iommu_msi_get_resv_regions(dev, list);
+		iort_iommu_get_resv_regions(dev, list);
 
 }
 EXPORT_SYMBOL(iommu_dma_get_resv_regions);
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index a8198b83753d..e5d2de9caf7f 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -36,7 +36,7 @@ int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
 #else
 static inline void acpi_iort_init(void) { }
@@ -52,7 +52,7 @@ static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 { return -ENODEV; }
 static inline
-void iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
+void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
 { }
 
 static inline phys_addr_t acpi_iort_dma_get_max_cpu_address(void)
-- 
cgit v1.2.3


From 491cf4a6735a957cd0733365f8d17e0f4308f5a4 Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:39 +0100
Subject: ACPI/IORT: Add support to retrieve IORT RMR reserved regions

Parse through the IORT RMR nodes and populate the reserve region list
corresponding to a given IOMMU and device(optional). Also, go through
the ID mappings of the RMR node and retrieve all the SIDs associated
with it.

Reviewed-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-5-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iommu.h     |   8 ++
 2 files changed, 299 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index cd5d1d7823cb..b6273af316c6 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -788,6 +788,294 @@ void acpi_configure_pmsi_domain(struct device *dev)
 }
 
 #ifdef CONFIG_IOMMU_API
+static void iort_rmr_free(struct device *dev,
+			  struct iommu_resv_region *region)
+{
+	struct iommu_iort_rmr_data *rmr_data;
+
+	rmr_data = container_of(region, struct iommu_iort_rmr_data, rr);
+	kfree(rmr_data->sids);
+	kfree(rmr_data);
+}
+
+static struct iommu_iort_rmr_data *iort_rmr_alloc(
+					struct acpi_iort_rmr_desc *rmr_desc,
+					int prot, enum iommu_resv_type type,
+					u32 *sids, u32 num_sids)
+{
+	struct iommu_iort_rmr_data *rmr_data;
+	struct iommu_resv_region *region;
+	u32 *sids_copy;
+	u64 addr = rmr_desc->base_address, size = rmr_desc->length;
+
+	rmr_data = kmalloc(sizeof(*rmr_data), GFP_KERNEL);
+	if (!rmr_data)
+		return NULL;
+
+	/* Create a copy of SIDs array to associate with this rmr_data */
+	sids_copy = kmemdup(sids, num_sids * sizeof(*sids), GFP_KERNEL);
+	if (!sids_copy) {
+		kfree(rmr_data);
+		return NULL;
+	}
+	rmr_data->sids = sids_copy;
+	rmr_data->num_sids = num_sids;
+
+	if (!IS_ALIGNED(addr, SZ_64K) || !IS_ALIGNED(size, SZ_64K)) {
+		/* PAGE align base addr and size */
+		addr &= PAGE_MASK;
+		size = PAGE_ALIGN(size + offset_in_page(rmr_desc->base_address));
+
+		pr_err(FW_BUG "RMR descriptor[0x%llx - 0x%llx] not aligned to 64K, continue with [0x%llx - 0x%llx]\n",
+		       rmr_desc->base_address,
+		       rmr_desc->base_address + rmr_desc->length - 1,
+		       addr, addr + size - 1);
+	}
+
+	region = &rmr_data->rr;
+	INIT_LIST_HEAD(&region->list);
+	region->start = addr;
+	region->length = size;
+	region->prot = prot;
+	region->type = type;
+	region->free = iort_rmr_free;
+
+	return rmr_data;
+}
+
+static void iort_rmr_desc_check_overlap(struct acpi_iort_rmr_desc *desc,
+					u32 count)
+{
+	int i, j;
+
+	for (i = 0; i < count; i++) {
+		u64 end, start = desc[i].base_address, length = desc[i].length;
+
+		if (!length) {
+			pr_err(FW_BUG "RMR descriptor[0x%llx] with zero length, continue anyway\n",
+			       start);
+			continue;
+		}
+
+		end = start + length - 1;
+
+		/* Check for address overlap */
+		for (j = i + 1; j < count; j++) {
+			u64 e_start = desc[j].base_address;
+			u64 e_end = e_start + desc[j].length - 1;
+
+			if (start <= e_end && end >= e_start)
+				pr_err(FW_BUG "RMR descriptor[0x%llx - 0x%llx] overlaps, continue anyway\n",
+				       start, end);
+		}
+	}
+}
+
+/*
+ * Please note, we will keep the already allocated RMR reserve
+ * regions in case of a memory allocation failure.
+ */
+static void iort_get_rmrs(struct acpi_iort_node *node,
+			  struct acpi_iort_node *smmu,
+			  u32 *sids, u32 num_sids,
+			  struct list_head *head)
+{
+	struct acpi_iort_rmr *rmr = (struct acpi_iort_rmr *)node->node_data;
+	struct acpi_iort_rmr_desc *rmr_desc;
+	int i;
+
+	rmr_desc = ACPI_ADD_PTR(struct acpi_iort_rmr_desc, node,
+				rmr->rmr_offset);
+
+	iort_rmr_desc_check_overlap(rmr_desc, rmr->rmr_count);
+
+	for (i = 0; i < rmr->rmr_count; i++, rmr_desc++) {
+		struct iommu_iort_rmr_data *rmr_data;
+		enum iommu_resv_type type;
+		int prot = IOMMU_READ | IOMMU_WRITE;
+
+		if (rmr->flags & ACPI_IORT_RMR_REMAP_PERMITTED)
+			type = IOMMU_RESV_DIRECT_RELAXABLE;
+		else
+			type = IOMMU_RESV_DIRECT;
+
+		if (rmr->flags & ACPI_IORT_RMR_ACCESS_PRIVILEGE)
+			prot |= IOMMU_PRIV;
+
+		/* Attributes 0x00 - 0x03 represents device memory */
+		if (ACPI_IORT_RMR_ACCESS_ATTRIBUTES(rmr->flags) <=
+				ACPI_IORT_RMR_ATTR_DEVICE_GRE)
+			prot |= IOMMU_MMIO;
+		else if (ACPI_IORT_RMR_ACCESS_ATTRIBUTES(rmr->flags) ==
+				ACPI_IORT_RMR_ATTR_NORMAL_IWB_OWB)
+			prot |= IOMMU_CACHE;
+
+		rmr_data = iort_rmr_alloc(rmr_desc, prot, type,
+					  sids, num_sids);
+		if (!rmr_data)
+			return;
+
+		list_add_tail(&rmr_data->rr.list, head);
+	}
+}
+
+static u32 *iort_rmr_alloc_sids(u32 *sids, u32 count, u32 id_start,
+				u32 new_count)
+{
+	u32 *new_sids;
+	u32 total_count = count + new_count;
+	int i;
+
+	new_sids = krealloc_array(sids, count + new_count,
+				  sizeof(*new_sids), GFP_KERNEL);
+	if (!new_sids)
+		return NULL;
+
+	for (i = count; i < total_count; i++)
+		new_sids[i] = id_start++;
+
+	return new_sids;
+}
+
+static bool iort_rmr_has_dev(struct device *dev, u32 id_start,
+			     u32 id_count)
+{
+	int i;
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	/*
+	 * Make sure the kernel has preserved the boot firmware PCIe
+	 * configuration. This is required to ensure that the RMR PCIe
+	 * StreamIDs are still valid (Refer: ARM DEN 0049E.d Section 3.1.1.5).
+	 */
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_host_bridge *host = pci_find_host_bridge(pdev->bus);
+
+		if (!host->preserve_config)
+			return false;
+	}
+
+	for (i = 0; i < fwspec->num_ids; i++) {
+		if (fwspec->ids[i] >= id_start &&
+		    fwspec->ids[i] <= id_start + id_count)
+			return true;
+	}
+
+	return false;
+}
+
+static void iort_node_get_rmr_info(struct acpi_iort_node *node,
+				   struct acpi_iort_node *iommu,
+				   struct device *dev, struct list_head *head)
+{
+	struct acpi_iort_node *smmu = NULL;
+	struct acpi_iort_rmr *rmr;
+	struct acpi_iort_id_mapping *map;
+	u32 *sids = NULL;
+	u32 num_sids = 0;
+	int i;
+
+	if (!node->mapping_offset || !node->mapping_count) {
+		pr_err(FW_BUG "Invalid ID mapping, skipping RMR node %p\n",
+		       node);
+		return;
+	}
+
+	rmr = (struct acpi_iort_rmr *)node->node_data;
+	if (!rmr->rmr_offset || !rmr->rmr_count)
+		return;
+
+	map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, node,
+			   node->mapping_offset);
+
+	/*
+	 * Go through the ID mappings and see if we have a match for SMMU
+	 * and dev(if !NULL). If found, get the sids for the Node.
+	 * Please note, id_count is equal to the number of IDs  in the
+	 * range minus one.
+	 */
+	for (i = 0; i < node->mapping_count; i++, map++) {
+		struct acpi_iort_node *parent;
+
+		if (!map->id_count)
+			continue;
+
+		parent = ACPI_ADD_PTR(struct acpi_iort_node, iort_table,
+				      map->output_reference);
+		if (parent != iommu)
+			continue;
+
+		/* If dev is valid, check RMR node corresponds to the dev SID */
+		if (dev && !iort_rmr_has_dev(dev, map->output_base,
+					     map->id_count))
+			continue;
+
+		/* Retrieve SIDs associated with the Node. */
+		sids = iort_rmr_alloc_sids(sids, num_sids, map->output_base,
+					   map->id_count + 1);
+		if (!sids)
+			return;
+
+		num_sids += map->id_count + 1;
+	}
+
+	if (!sids)
+		return;
+
+	iort_get_rmrs(node, smmu, sids, num_sids, head);
+	kfree(sids);
+}
+
+static void iort_find_rmrs(struct acpi_iort_node *iommu, struct device *dev,
+			   struct list_head *head)
+{
+	struct acpi_table_iort *iort;
+	struct acpi_iort_node *iort_node, *iort_end;
+	int i;
+
+	/* Only supports ARM DEN 0049E.d onwards */
+	if (iort_table->revision < 5)
+		return;
+
+	iort = (struct acpi_table_iort *)iort_table;
+
+	iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort,
+				 iort->node_offset);
+	iort_end = ACPI_ADD_PTR(struct acpi_iort_node, iort,
+				iort_table->length);
+
+	for (i = 0; i < iort->node_count; i++) {
+		if (WARN_TAINT(iort_node >= iort_end, TAINT_FIRMWARE_WORKAROUND,
+			       "IORT node pointer overflows, bad table!\n"))
+			return;
+
+		if (iort_node->type == ACPI_IORT_NODE_RMR)
+			iort_node_get_rmr_info(iort_node, iommu, dev, head);
+
+		iort_node = ACPI_ADD_PTR(struct acpi_iort_node, iort_node,
+					 iort_node->length);
+	}
+}
+
+/*
+ * Populate the RMR list associated with a given IOMMU and dev(if provided).
+ * If dev is NULL, the function populates all the RMRs associated with the
+ * given IOMMU.
+ */
+static void iort_iommu_rmr_get_resv_regions(struct fwnode_handle *iommu_fwnode,
+					    struct device *dev,
+					    struct list_head *head)
+{
+	struct acpi_iort_node *iommu;
+
+	iommu = iort_get_iort_node(iommu_fwnode);
+	if (!iommu)
+		return;
+
+	iort_find_rmrs(iommu, dev, head);
+}
+
 static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
 {
 	struct acpi_iort_node *iommu;
@@ -868,7 +1156,10 @@ static void iort_iommu_msi_get_resv_regions(struct device *dev,
  */
 void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head)
 {
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
 	iort_iommu_msi_get_resv_regions(dev, head);
+	iort_iommu_rmr_get_resv_regions(fwspec->iommu_fwnode, dev, head);
 }
 
 static inline bool iort_iommu_driver_enabled(u8 type)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b22ffa6bc4a9..e6abd998dbe7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -146,6 +146,14 @@ struct iommu_resv_region {
 	void (*free)(struct device *dev, struct iommu_resv_region *region);
 };
 
+struct iommu_iort_rmr_data {
+	struct iommu_resv_region rr;
+
+	/* Stream IDs associated with IORT RMR entry */
+	const u32 *sids;
+	u32 num_sids;
+};
+
 /**
  * enum iommu_dev_features - Per device IOMMU features
  * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses
-- 
cgit v1.2.3


From e302eea8f49717253ac64fd45b7cc719e87fa010 Mon Sep 17 00:00:00 2001
From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Date: Wed, 15 Jun 2022 11:10:40 +0100
Subject: ACPI/IORT: Add a helper to retrieve RMR info directly

This will provide a way for SMMU drivers to retrieve StreamIDs
associated with IORT RMR nodes and use that to set bypass settings
for those IDs.

Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20220615101044.1972-6-shameerali.kolothum.thodi@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c | 28 ++++++++++++++++++++++++++++
 include/linux/acpi_iort.h |  8 ++++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index b6273af316c6..cd1349d3544e 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1394,6 +1394,34 @@ int iort_dma_get_ranges(struct device *dev, u64 *size)
 		return nc_dma_get_range(dev, size);
 }
 
+/**
+ * iort_get_rmr_sids - Retrieve IORT RMR node reserved regions with
+ *                     associated StreamIDs information.
+ * @iommu_fwnode: fwnode associated with IOMMU
+ * @head: Resereved region list
+ */
+void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head)
+{
+	iort_iommu_rmr_get_resv_regions(iommu_fwnode, NULL, head);
+}
+EXPORT_SYMBOL_GPL(iort_get_rmr_sids);
+
+/**
+ * iort_put_rmr_sids - Free memory allocated for RMR reserved regions.
+ * @iommu_fwnode: fwnode associated with IOMMU
+ * @head: Resereved region list
+ */
+void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head)
+{
+	struct iommu_resv_region *entry, *next;
+
+	list_for_each_entry_safe(entry, next, head, list)
+		entry->free(NULL, entry);
+}
+EXPORT_SYMBOL_GPL(iort_put_rmr_sids);
+
 static void __init acpi_iort_register_irq(int hwirq, const char *name,
 					  int trigger,
 					  struct resource *res)
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index e5d2de9caf7f..b43be0987b19 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -33,6 +33,10 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
 					  enum irq_domain_bus_token bus_token);
 void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
+void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head);
+void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode,
+		       struct list_head *head);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
@@ -46,6 +50,10 @@ static inline struct irq_domain *iort_get_device_domain(
 	struct device *dev, u32 id, enum irq_domain_bus_token bus_token)
 { return NULL; }
 static inline void acpi_configure_pmsi_domain(struct device *dev) { }
+static inline
+void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *head) { }
+static inline
+void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *head) { }
 /* IOMMU interface */
 static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 { return -ENODEV; }
-- 
cgit v1.2.3


From bfdd231374181254742c5e2faef0bef2d30c0ee4 Mon Sep 17 00:00:00 2001
From: Yunfei Wang <yf.wang@mediatek.com>
Date: Thu, 30 Jun 2022 17:29:25 +0800
Subject: iommu/io-pgtable-arm-v7s: Add a quirk to allow pgtable PA up to 35bit

Single memory zone feature will remove ZONE_DMA32 and ZONE_DMA and
cause pgtable PA size larger than 32bit.

Since Mediatek IOMMU hardware support at most 35bit PA in pgtable,
so add a quirk to allow the PA of pgtables support up to bit35.

Signed-off-by: Ning Li <ning.li@mediatek.com>
Signed-off-by: Yunfei Wang <yf.wang@mediatek.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20220630092927.24925-2-yf.wang@mediatek.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/io-pgtable-arm-v7s.c | 75 ++++++++++++++++++++++++++++----------
 include/linux/io-pgtable.h         | 15 +++++---
 2 files changed, 66 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index be066c1503d3..ba3115fd0f86 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -182,14 +182,8 @@ static bool arm_v7s_is_mtk_enabled(struct io_pgtable_cfg *cfg)
 		(cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_EXT);
 }
 
-static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
-				    struct io_pgtable_cfg *cfg)
+static arm_v7s_iopte to_mtk_iopte(phys_addr_t paddr, arm_v7s_iopte pte)
 {
-	arm_v7s_iopte pte = paddr & ARM_V7S_LVL_MASK(lvl);
-
-	if (!arm_v7s_is_mtk_enabled(cfg))
-		return pte;
-
 	if (paddr & BIT_ULL(32))
 		pte |= ARM_V7S_ATTR_MTK_PA_BIT32;
 	if (paddr & BIT_ULL(33))
@@ -199,6 +193,17 @@ static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
 	return pte;
 }
 
+static arm_v7s_iopte paddr_to_iopte(phys_addr_t paddr, int lvl,
+				    struct io_pgtable_cfg *cfg)
+{
+	arm_v7s_iopte pte = paddr & ARM_V7S_LVL_MASK(lvl);
+
+	if (arm_v7s_is_mtk_enabled(cfg))
+		return to_mtk_iopte(paddr, pte);
+
+	return pte;
+}
+
 static phys_addr_t iopte_to_paddr(arm_v7s_iopte pte, int lvl,
 				  struct io_pgtable_cfg *cfg)
 {
@@ -240,10 +245,17 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
 	dma_addr_t dma;
 	size_t size = ARM_V7S_TABLE_SIZE(lvl, cfg);
 	void *table = NULL;
+	gfp_t gfp_l1;
+
+	/*
+	 * ARM_MTK_TTBR_EXT extend the translation table base support larger
+	 * memory address.
+	 */
+	gfp_l1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ?
+		 GFP_KERNEL : ARM_V7S_TABLE_GFP_DMA;
 
 	if (lvl == 1)
-		table = (void *)__get_free_pages(
-			__GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size));
+		table = (void *)__get_free_pages(gfp_l1 | __GFP_ZERO, get_order(size));
 	else if (lvl == 2)
 		table = kmem_cache_zalloc(data->l2_tables, gfp);
 
@@ -251,7 +263,8 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
 		return NULL;
 
 	phys = virt_to_phys(table);
-	if (phys != (arm_v7s_iopte)phys) {
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ?
+	    phys >= (1ULL << cfg->oas) : phys != (arm_v7s_iopte)phys) {
 		/* Doesn't fit in PTE */
 		dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
 		goto out_free;
@@ -457,9 +470,14 @@ static arm_v7s_iopte arm_v7s_install_table(arm_v7s_iopte *table,
 					   arm_v7s_iopte curr,
 					   struct io_pgtable_cfg *cfg)
 {
+	phys_addr_t phys = virt_to_phys(table);
 	arm_v7s_iopte old, new;
 
-	new = virt_to_phys(table) | ARM_V7S_PTE_TYPE_TABLE;
+	new = phys | ARM_V7S_PTE_TYPE_TABLE;
+
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT)
+		new = to_mtk_iopte(phys, new);
+
 	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 		new |= ARM_V7S_ATTR_NS_TABLE;
 
@@ -779,6 +797,8 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 						void *cookie)
 {
 	struct arm_v7s_io_pgtable *data;
+	slab_flags_t slab_flag;
+	phys_addr_t paddr;
 
 	if (cfg->ias > (arm_v7s_is_mtk_enabled(cfg) ? 34 : ARM_V7S_ADDR_BITS))
 		return NULL;
@@ -788,7 +808,8 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 
 	if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
 			    IO_PGTABLE_QUIRK_NO_PERMS |
-			    IO_PGTABLE_QUIRK_ARM_MTK_EXT))
+			    IO_PGTABLE_QUIRK_ARM_MTK_EXT |
+			    IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT))
 		return NULL;
 
 	/* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */
@@ -796,15 +817,27 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	    !(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS))
 			return NULL;
 
+	if ((cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT) &&
+	    !arm_v7s_is_mtk_enabled(cfg))
+		return NULL;
+
 	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return NULL;
 
 	spin_lock_init(&data->split_lock);
+
+	/*
+	 * ARM_MTK_TTBR_EXT extend the translation table base support larger
+	 * memory address.
+	 */
+	slab_flag = cfg->quirks & IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT ?
+		    0 : ARM_V7S_TABLE_SLAB_FLAGS;
+
 	data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
 					    ARM_V7S_TABLE_SIZE(2, cfg),
 					    ARM_V7S_TABLE_SIZE(2, cfg),
-					    ARM_V7S_TABLE_SLAB_FLAGS, NULL);
+					    slab_flag, NULL);
 	if (!data->l2_tables)
 		goto out_free_data;
 
@@ -850,12 +883,16 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	wmb();
 
 	/* TTBR */
-	cfg->arm_v7s_cfg.ttbr = virt_to_phys(data->pgd) | ARM_V7S_TTBR_S |
-				(cfg->coherent_walk ? (ARM_V7S_TTBR_NOS |
-				 ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_WBWA) |
-				 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_WBWA)) :
-				(ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_NC) |
-				 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_NC)));
+	paddr = virt_to_phys(data->pgd);
+	if (arm_v7s_is_mtk_enabled(cfg))
+		cfg->arm_v7s_cfg.ttbr = paddr | upper_32_bits(paddr);
+	else
+		cfg->arm_v7s_cfg.ttbr = paddr | ARM_V7S_TTBR_S |
+					(cfg->coherent_walk ? (ARM_V7S_TTBR_NOS |
+					 ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_WBWA) |
+					 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_WBWA)) :
+					(ARM_V7S_TTBR_IRGN_ATTR(ARM_V7S_RGN_NC) |
+					 ARM_V7S_TTBR_ORGN_ATTR(ARM_V7S_RGN_NC)));
 	return &data->iop;
 
 out_free_data:
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 86af6f0a00a2..ca98aeadcc80 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -74,17 +74,22 @@ struct io_pgtable_cfg {
 	 *	to support up to 35 bits PA where the bit32, bit33 and bit34 are
 	 *	encoded in the bit9, bit4 and bit5 of the PTE respectively.
 	 *
+	 * IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT: (ARM v7s format) MediaTek IOMMUs
+	 *	extend the translation table base support up to 35 bits PA, the
+	 *	encoding format is same with IO_PGTABLE_QUIRK_ARM_MTK_EXT.
+	 *
 	 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 	 *	for use in the upper half of a split address space.
 	 *
 	 * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability
 	 *	attributes set in the TCR for a non-coherent page-table walker.
 	 */
-	#define IO_PGTABLE_QUIRK_ARM_NS		BIT(0)
-	#define IO_PGTABLE_QUIRK_NO_PERMS	BIT(1)
-	#define IO_PGTABLE_QUIRK_ARM_MTK_EXT	BIT(3)
-	#define IO_PGTABLE_QUIRK_ARM_TTBR1	BIT(5)
-	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA	BIT(6)
+	#define IO_PGTABLE_QUIRK_ARM_NS			BIT(0)
+	#define IO_PGTABLE_QUIRK_NO_PERMS		BIT(1)
+	#define IO_PGTABLE_QUIRK_ARM_MTK_EXT		BIT(3)
+	#define IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT	BIT(4)
+	#define IO_PGTABLE_QUIRK_ARM_TTBR1		BIT(5)
+	#define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA		BIT(6)
 	unsigned long			quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
-- 
cgit v1.2.3


From 3566ee1d776c1393393564b2514f9cd52a49c16e Mon Sep 17 00:00:00 2001
From: Michael Kawano <mkawano@linux.ibm.com>
Date: Thu, 7 Jul 2022 15:57:27 +0200
Subject: vfio/ccw: Remove UUID from s390 debug log

As vfio-ccw devices are created/destroyed, the uuid of the associated
mdevs that are recorded in $S390DBF/vfio_ccw_msg/sprintf get lost.
This is because a pointer to the UUID is stored instead of the UUID
itself, and that memory may have been repurposed if/when the logs are
examined. The result is usually garbage UUID data in the logs, though
there is an outside chance of an oops happening here.

Simply remove the UUID from the traces, as the subchannel number will
provide useful configuration information for problem determination,
and is stored directly into the log instead of a pointer.

As we were the only consumer of mdev_uuid(), remove that too.

Cc: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Michael Kawano <mkawano@linux.ibm.com>
Fixes: 60e05d1cf0875 ("vfio-ccw: add some logging")
Fixes: b7701dfbf9832 ("vfio-ccw: Register a chp_event callback for vfio-ccw")
[farman: reworded commit message, added Fixes: tags]
Signed-off-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Link: https://lore.kernel.org/r/20220707135737.720765-2-farman@linux.ibm.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/s390/cio/vfio_ccw_drv.c |  5 ++---
 drivers/s390/cio/vfio_ccw_fsm.c | 26 ++++++++++++--------------
 drivers/s390/cio/vfio_ccw_ops.c |  8 ++++----
 include/linux/mdev.h            |  5 -----
 4 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index ee182cfb467d..35055eb94115 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -14,7 +14,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/slab.h>
-#include <linux/uuid.h>
 #include <linux/mdev.h>
 
 #include <asm/isc.h>
@@ -358,8 +357,8 @@ static int vfio_ccw_chp_event(struct subchannel *sch,
 		return 0;
 
 	trace_vfio_ccw_chp_event(private->sch->schid, mask, event);
-	VFIO_CCW_MSG_EVENT(2, "%pUl (%x.%x.%04x): mask=0x%x event=%d\n",
-			   mdev_uuid(private->mdev), sch->schid.cssid,
+	VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: mask=0x%x event=%d\n",
+			   sch->schid.cssid,
 			   sch->schid.ssid, sch->schid.sch_no,
 			   mask, event);
 
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
index 8483a266051c..bbcc5b486749 100644
--- a/drivers/s390/cio/vfio_ccw_fsm.c
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/vfio.h>
-#include <linux/mdev.h>
 
 #include "ioasm.h"
 #include "vfio_ccw_private.h"
@@ -242,7 +241,6 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 	union orb *orb;
 	union scsw *scsw = &private->scsw;
 	struct ccw_io_region *io_region = private->io_region;
-	struct mdev_device *mdev = private->mdev;
 	char *errstr = "request";
 	struct subchannel_id schid = get_schid(private);
 
@@ -256,8 +254,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		if (orb->tm.b) {
 			io_region->ret_code = -EOPNOTSUPP;
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): transport mode\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: transport mode\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no);
 			errstr = "transport mode";
 			goto err_out;
@@ -265,8 +263,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		io_region->ret_code = cp_init(&private->cp, orb);
 		if (io_region->ret_code) {
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): cp_init=%d\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: cp_init=%d\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no,
 					   io_region->ret_code);
 			errstr = "cp init";
@@ -276,8 +274,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		io_region->ret_code = cp_prefetch(&private->cp);
 		if (io_region->ret_code) {
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): cp_prefetch=%d\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: cp_prefetch=%d\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no,
 					   io_region->ret_code);
 			errstr = "cp prefetch";
@@ -289,8 +287,8 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		io_region->ret_code = fsm_io_helper(private);
 		if (io_region->ret_code) {
 			VFIO_CCW_MSG_EVENT(2,
-					   "%pUl (%x.%x.%04x): fsm_io_helper=%d\n",
-					   mdev_uuid(mdev), schid.cssid,
+					   "sch %x.%x.%04x: fsm_io_helper=%d\n",
+					   schid.cssid,
 					   schid.ssid, schid.sch_no,
 					   io_region->ret_code);
 			errstr = "cp fsm_io_helper";
@@ -300,16 +298,16 @@ static void fsm_io_request(struct vfio_ccw_private *private,
 		return;
 	} else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) {
 		VFIO_CCW_MSG_EVENT(2,
-				   "%pUl (%x.%x.%04x): halt on io_region\n",
-				   mdev_uuid(mdev), schid.cssid,
+				   "sch %x.%x.%04x: halt on io_region\n",
+				   schid.cssid,
 				   schid.ssid, schid.sch_no);
 		/* halt is handled via the async cmd region */
 		io_region->ret_code = -EOPNOTSUPP;
 		goto err_out;
 	} else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) {
 		VFIO_CCW_MSG_EVENT(2,
-				   "%pUl (%x.%x.%04x): clear on io_region\n",
-				   mdev_uuid(mdev), schid.cssid,
+				   "sch %x.%x.%04x: clear on io_region\n",
+				   schid.cssid,
 				   schid.ssid, schid.sch_no);
 		/* clear is handled via the async cmd region */
 		io_region->ret_code = -EOPNOTSUPP;
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index b49e2e9db2dc..0e05bff78b8e 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -131,8 +131,8 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev)
 	private->mdev = mdev;
 	private->state = VFIO_CCW_STATE_IDLE;
 
-	VFIO_CCW_MSG_EVENT(2, "mdev %pUl, sch %x.%x.%04x: create\n",
-			   mdev_uuid(mdev), private->sch->schid.cssid,
+	VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n",
+			   private->sch->schid.cssid,
 			   private->sch->schid.ssid,
 			   private->sch->schid.sch_no);
 
@@ -154,8 +154,8 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev)
 {
 	struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent);
 
-	VFIO_CCW_MSG_EVENT(2, "mdev %pUl, sch %x.%x.%04x: remove\n",
-			   mdev_uuid(mdev), private->sch->schid.cssid,
+	VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n",
+			   private->sch->schid.cssid,
 			   private->sch->schid.ssid,
 			   private->sch->schid.sch_no);
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index bb539794f54a..47ad3b104d9e 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -65,11 +65,6 @@ struct mdev_driver {
 	struct device_driver driver;
 };
 
-static inline const guid_t *mdev_uuid(struct mdev_device *mdev)
-{
-	return &mdev->uuid;
-}
-
 extern struct bus_type mdev_bus_type;
 
 int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver);
-- 
cgit v1.2.3


From 87686cc845c3be7dea777f1dbf2de0767007cda8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 16:10:39 +0530
Subject: OPP: Make dev_pm_opp_set_regulators() accept NULL terminated list

Make dev_pm_opp_set_regulators() accept a NULL terminated list of names
instead of making the callers keep the two parameters in sync, which
creates an opportunity for bugs to get in.

Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Steven Price <steven.price@arm.com> # panfrost
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c                |  9 ++++-----
 drivers/cpufreq/ti-cpufreq.c                |  7 +++----
 drivers/devfreq/exynos-bus.c                |  4 ++--
 drivers/gpu/drm/lima/lima_devfreq.c         |  3 ++-
 drivers/gpu/drm/panfrost/panfrost_devfreq.c |  3 +--
 drivers/gpu/drm/panfrost/panfrost_drv.c     | 15 ++++++++++-----
 drivers/opp/core.c                          | 18 ++++++++++++------
 drivers/soc/tegra/pmc.c                     |  4 ++--
 include/linux/pm_opp.h                      |  9 ++++-----
 9 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 8fcaba541539..be0c19b3ffa5 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -193,7 +193,7 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 	struct private_data *priv;
 	struct device *cpu_dev;
 	bool fallback = false;
-	const char *reg_name;
+	const char *reg_name[] = { NULL, NULL };
 	int ret;
 
 	/* Check if this CPU is already covered by some other policy */
@@ -218,10 +218,9 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 	 * OPP layer will be taking care of regulators now, but it needs to know
 	 * the name of the regulator first.
 	 */
-	reg_name = find_supply_name(cpu_dev);
-	if (reg_name) {
-		priv->opp_table = dev_pm_opp_set_regulators(cpu_dev, &reg_name,
-							    1);
+	reg_name[0] = find_supply_name(cpu_dev);
+	if (reg_name[0]) {
+		priv->opp_table = dev_pm_opp_set_regulators(cpu_dev, reg_name);
 		if (IS_ERR(priv->opp_table)) {
 			ret = PTR_ERR(priv->opp_table);
 			if (ret != -EPROBE_DEFER)
diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index 8f9fdd864391..560d67a6bef1 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -173,7 +173,7 @@ static struct ti_cpufreq_soc_data omap34xx_soc_data = {
  *    seems to always read as 0).
  */
 
-static const char * const omap3_reg_names[] = {"cpu0", "vbb"};
+static const char * const omap3_reg_names[] = {"cpu0", "vbb", NULL};
 
 static struct ti_cpufreq_soc_data omap36xx_soc_data = {
 	.reg_names = omap3_reg_names,
@@ -326,7 +326,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev)
 	const struct of_device_id *match;
 	struct opp_table *ti_opp_table;
 	struct ti_cpufreq_data *opp_data;
-	const char * const default_reg_names[] = {"vdd", "vbb"};
+	const char * const default_reg_names[] = {"vdd", "vbb", NULL};
 	int ret;
 
 	match = dev_get_platdata(&pdev->dev);
@@ -387,8 +387,7 @@ static int ti_cpufreq_probe(struct platform_device *pdev)
 		if (opp_data->soc_data->reg_names)
 			reg_names = opp_data->soc_data->reg_names;
 		ti_opp_table = dev_pm_opp_set_regulators(opp_data->cpu_dev,
-							 reg_names,
-							 ARRAY_SIZE(default_reg_names));
+							 reg_names);
 		if (IS_ERR(ti_opp_table)) {
 			dev_pm_opp_put_supported_hw(opp_data->opp_table);
 			ret =  PTR_ERR(ti_opp_table);
diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index e689101abc93..541baff93ee8 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -180,10 +180,10 @@ static int exynos_bus_parent_parse_of(struct device_node *np,
 {
 	struct device *dev = bus->dev;
 	struct opp_table *opp_table;
-	const char *vdd = "vdd";
+	const char *supplies[] = { "vdd", NULL };
 	int i, ret, count, size;
 
-	opp_table = dev_pm_opp_set_regulators(dev, &vdd, 1);
+	opp_table = dev_pm_opp_set_regulators(dev, supplies);
 	if (IS_ERR(opp_table)) {
 		ret = PTR_ERR(opp_table);
 		dev_err(dev, "failed to set regulators %d\n", ret);
diff --git a/drivers/gpu/drm/lima/lima_devfreq.c b/drivers/gpu/drm/lima/lima_devfreq.c
index 8989e215dfc9..dc83c5421125 100644
--- a/drivers/gpu/drm/lima/lima_devfreq.c
+++ b/drivers/gpu/drm/lima/lima_devfreq.c
@@ -111,6 +111,7 @@ int lima_devfreq_init(struct lima_device *ldev)
 	struct dev_pm_opp *opp;
 	unsigned long cur_freq;
 	int ret;
+	const char *regulator_names[] = { "mali", NULL };
 
 	if (!device_property_present(dev, "operating-points-v2"))
 		/* Optional, continue without devfreq */
@@ -122,7 +123,7 @@ int lima_devfreq_init(struct lima_device *ldev)
 	if (ret)
 		return ret;
 
-	ret = devm_pm_opp_set_regulators(dev, (const char *[]){ "mali" }, 1);
+	ret = devm_pm_opp_set_regulators(dev, regulator_names);
 	if (ret) {
 		/* Continue if the optional regulator is missing */
 		if (ret != -ENODEV)
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 194af7f607a6..5110cd9b2425 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -101,8 +101,7 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
 		return 0;
 	}
 
-	ret = devm_pm_opp_set_regulators(dev, pfdev->comp->supply_names,
-					 pfdev->comp->num_supplies);
+	ret = devm_pm_opp_set_regulators(dev, pfdev->comp->supply_names);
 	if (ret) {
 		/* Continue if the optional regulator is missing */
 		if (ret != -ENODEV) {
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 7fcbc2a5b6cd..8a4bef65d38c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -625,24 +625,29 @@ static int panfrost_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const char * const default_supplies[] = { "mali" };
+/*
+ * The OPP core wants the supply names to be NULL terminated, but we need the
+ * correct num_supplies value for regulator core. Hence, we NULL terminate here
+ * and then initialize num_supplies with ARRAY_SIZE - 1.
+ */
+static const char * const default_supplies[] = { "mali", NULL };
 static const struct panfrost_compatible default_data = {
-	.num_supplies = ARRAY_SIZE(default_supplies),
+	.num_supplies = ARRAY_SIZE(default_supplies) - 1,
 	.supply_names = default_supplies,
 	.num_pm_domains = 1, /* optional */
 	.pm_domain_names = NULL,
 };
 
 static const struct panfrost_compatible amlogic_data = {
-	.num_supplies = ARRAY_SIZE(default_supplies),
+	.num_supplies = ARRAY_SIZE(default_supplies) - 1,
 	.supply_names = default_supplies,
 	.vendor_quirk = panfrost_gpu_amlogic_quirk,
 };
 
-static const char * const mediatek_mt8183_supplies[] = { "mali", "sram" };
+static const char * const mediatek_mt8183_supplies[] = { "mali", "sram", NULL };
 static const char * const mediatek_mt8183_pm_domains[] = { "core0", "core1", "core2" };
 static const struct panfrost_compatible mediatek_mt8183_data = {
-	.num_supplies = ARRAY_SIZE(mediatek_mt8183_supplies),
+	.num_supplies = ARRAY_SIZE(mediatek_mt8183_supplies) - 1,
 	.supply_names = mediatek_mt8183_supplies,
 	.num_pm_domains = ARRAY_SIZE(mediatek_mt8183_pm_domains),
 	.pm_domain_names = mediatek_mt8183_pm_domains,
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e166bfe5fc90..4e4593957ec5 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2105,13 +2105,20 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
  * This must be called before any OPPs are initialized for the device.
  */
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
-					    const char * const names[],
-					    unsigned int count)
+					    const char * const names[])
 {
 	struct dev_pm_opp_supply *supplies;
+	const char * const *temp = names;
 	struct opp_table *opp_table;
 	struct regulator *reg;
-	int ret, i;
+	int count = 0, ret, i;
+
+	/* Count number of regulators */
+	while (*temp++)
+		count++;
+
+	if (!count)
+		return ERR_PTR(-EINVAL);
 
 	opp_table = _add_opp_table(dev, false);
 	if (IS_ERR(opp_table))
@@ -2236,12 +2243,11 @@ static void devm_pm_opp_regulators_release(void *data)
  * Return: 0 on success and errorno otherwise.
  */
 int devm_pm_opp_set_regulators(struct device *dev,
-			       const char * const names[],
-			       unsigned int count)
+			       const char * const names[])
 {
 	struct opp_table *opp_table;
 
-	opp_table = dev_pm_opp_set_regulators(dev, names, count);
+	opp_table = dev_pm_opp_set_regulators(dev, names);
 	if (IS_ERR(opp_table))
 		return PTR_ERR(opp_table);
 
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index 5611d14d3ba2..6a4b8f7e7948 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -1384,7 +1384,7 @@ tegra_pmc_core_pd_opp_to_performance_state(struct generic_pm_domain *genpd,
 static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
 {
 	struct generic_pm_domain *genpd;
-	const char *rname = "core";
+	const char *rname[] = { "core", NULL};
 	int err;
 
 	genpd = devm_kzalloc(pmc->dev, sizeof(*genpd), GFP_KERNEL);
@@ -1395,7 +1395,7 @@ static int tegra_pmc_core_pd_add(struct tegra_pmc *pmc, struct device_node *np)
 	genpd->set_performance_state = tegra_pmc_core_pd_set_performance_state;
 	genpd->opp_to_performance_state = tegra_pmc_core_pd_opp_to_performance_state;
 
-	err = devm_pm_opp_set_regulators(pmc->dev, &rname, 1);
+	err = devm_pm_opp_set_regulators(pmc->dev, rname);
 	if (err)
 		return dev_err_probe(pmc->dev, err,
 				     "failed to set core OPP regulator\n");
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 6708b4ec244d..4c490865d574 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -159,9 +159,9 @@ void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
 int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[]);
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
-int devm_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
+int devm_pm_opp_set_regulators(struct device *dev, const char * const names[]);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -379,7 +379,7 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
+static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[])
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -387,8 +387,7 @@ static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, co
 static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
 
 static inline int devm_pm_opp_set_regulators(struct device *dev,
-					     const char * const names[],
-					     unsigned int count)
+					     const char * const names[])
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From 11b9b663585c4f8b00846089ebbca4d1e3283e86 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 25 May 2022 15:23:16 +0530
Subject: OPP: Add dev_pm_opp_set_config() and friends

The OPP core already have few configuration specific APIs and it is
getting complex or messy for both the OPP core and its users.

Lets introduce a new set of API which will be used for all kind of
different configurations, and shall eventually be used by all the
existing ones.

The new API, returns a unique token instead of a pointer to the OPP
table, which allows the OPP core to drop the resources selectively later
on.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 229 ++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/opp/opp.h      |  21 +++++
 include/linux/pm_opp.h |  42 +++++++++
 3 files changed, 291 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 4e4593957ec5..7ab20c3b91ed 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -13,11 +13,12 @@
 #include <linux/clk.h>
 #include <linux/errno.h>
 #include <linux/err.h>
-#include <linux/slab.h>
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/pm_domain.h>
 #include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
 
 #include "opp.h"
 
@@ -36,6 +37,9 @@ DEFINE_MUTEX(opp_table_lock);
 /* Flag indicating that opp_tables list is being updated at the moment */
 static bool opp_tables_busy;
 
+/* OPP ID allocator */
+static DEFINE_XARRAY_ALLOC1(opp_configs);
+
 static bool _find_opp_dev(const struct device *dev, struct opp_table *opp_table)
 {
 	struct opp_device *opp_dev;
@@ -2624,6 +2628,229 @@ int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names,
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
 
+static void _opp_clear_config(struct opp_config_data *data)
+{
+	if (data->flags & OPP_CONFIG_GENPD)
+		dev_pm_opp_detach_genpd(data->opp_table);
+	if (data->flags & OPP_CONFIG_REGULATOR)
+		dev_pm_opp_put_regulators(data->opp_table);
+	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
+		dev_pm_opp_put_supported_hw(data->opp_table);
+	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
+		dev_pm_opp_unregister_set_opp_helper(data->opp_table);
+	if (data->flags & OPP_CONFIG_PROP_NAME)
+		dev_pm_opp_put_prop_name(data->opp_table);
+	if (data->flags & OPP_CONFIG_CLK)
+		dev_pm_opp_put_clkname(data->opp_table);
+
+	dev_pm_opp_put_opp_table(data->opp_table);
+	kfree(data);
+}
+
+/**
+ * dev_pm_opp_set_config() - Set OPP configuration for the device.
+ * @dev: Device for which configuration is being set.
+ * @config: OPP configuration.
+ *
+ * This allows all device OPP configurations to be performed at once.
+ *
+ * This must be called before any OPPs are initialized for the device. This may
+ * be called multiple times for the same OPP table, for example once for each
+ * CPU that share the same table. This must be balanced by the same number of
+ * calls to dev_pm_opp_clear_config() in order to free the OPP table properly.
+ *
+ * This returns a token to the caller, which must be passed to
+ * dev_pm_opp_clear_config() to free the resources later. The value of the
+ * returned token will be >= 1 for success and negative for errors. The minimum
+ * value of 1 is chosen here to make it easy for callers to manage the resource.
+ */
+int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	struct opp_table *opp_table, *err;
+	struct opp_config_data *data;
+	unsigned int id;
+	int ret;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	opp_table = _add_opp_table(dev, false);
+	if (IS_ERR(opp_table)) {
+		kfree(data);
+		return PTR_ERR(opp_table);
+	}
+
+	data->opp_table = opp_table;
+	data->flags = 0;
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Configure clocks */
+	if (config->clk_names) {
+		const char * const *temp = config->clk_names;
+		int count = 0;
+
+		/* Count number of clks */
+		while (*temp++)
+			count++;
+
+		/*
+		 * This is a special case where we have a single clock, whose
+		 * connection id name is NULL, i.e. first two entries are NULL
+		 * in the array.
+		 */
+		if (!count && !config->clk_names[1])
+			count = 1;
+
+		/* We support only one clock name for now */
+		if (count != 1) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		err = dev_pm_opp_set_clkname(dev, config->clk_names[0]);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_CLK;
+	}
+
+	/* Configure property names */
+	if (config->prop_name) {
+		err = dev_pm_opp_set_prop_name(dev, config->prop_name);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_PROP_NAME;
+	}
+
+	/* Configure opp helper */
+	if (config->set_opp) {
+		err = dev_pm_opp_register_set_opp_helper(dev, config->set_opp);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
+	}
+
+	/* Configure supported hardware */
+	if (config->supported_hw) {
+		err = dev_pm_opp_set_supported_hw(dev, config->supported_hw,
+						  config->supported_hw_count);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_SUPPORTED_HW;
+	}
+
+	/* Configure supplies */
+	if (config->regulator_names) {
+		err = dev_pm_opp_set_regulators(dev, config->regulator_names);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_REGULATOR;
+	}
+
+	/* Attach genpds */
+	if (config->genpd_names) {
+		err = dev_pm_opp_attach_genpd(dev, config->genpd_names,
+					      config->virt_devs);
+		if (IS_ERR(err)) {
+			ret = PTR_ERR(err);
+			goto err;
+		}
+
+		data->flags |= OPP_CONFIG_GENPD;
+	}
+
+	ret = xa_alloc(&opp_configs, &id, data, XA_LIMIT(1, INT_MAX),
+		       GFP_KERNEL);
+	if (ret)
+		goto err;
+
+	return id;
+
+err:
+	_opp_clear_config(data);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_config);
+
+/**
+ * dev_pm_opp_clear_config() - Releases resources blocked for OPP configuration.
+ * @opp_table: OPP table returned from dev_pm_opp_set_config().
+ *
+ * This allows all device OPP configurations to be cleared at once. This must be
+ * called once for each call made to dev_pm_opp_set_config(), in order to free
+ * the OPPs properly.
+ *
+ * Currently the first call itself ends up freeing all the OPP configurations,
+ * while the later ones only drop the OPP table reference. This works well for
+ * now as we would never want to use an half initialized OPP table and want to
+ * remove the configurations together.
+ */
+void dev_pm_opp_clear_config(int token)
+{
+	struct opp_config_data *data;
+
+	/*
+	 * This lets the callers call this unconditionally and keep their code
+	 * simple.
+	 */
+	if (unlikely(token <= 0))
+		return;
+
+	data = xa_erase(&opp_configs, token);
+	if (WARN_ON(!data))
+		return;
+
+	_opp_clear_config(data);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_clear_config);
+
+static void devm_pm_opp_config_release(void *token)
+{
+	dev_pm_opp_clear_config((unsigned long)token);
+}
+
+/**
+ * devm_pm_opp_set_config() - Set OPP configuration for the device.
+ * @dev: Device for which configuration is being set.
+ * @config: OPP configuration.
+ *
+ * This allows all device OPP configurations to be performed at once.
+ * This is a resource-managed variant of dev_pm_opp_set_config().
+ *
+ * Return: 0 on success and errorno otherwise.
+ */
+int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	int token = dev_pm_opp_set_config(dev, config);
+
+	if (token < 0)
+		return token;
+
+	return devm_add_action_or_reset(dev, devm_pm_opp_config_release,
+					(void *) ((unsigned long) token));
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_config);
+
 /**
  * dev_pm_opp_xlate_required_opp() - Find required OPP for @src_table OPP.
  * @src_table: OPP table which has @dst_table as one of its required OPP table.
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 9e1cfcb0ea98..d652f0cc84f1 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -28,6 +28,27 @@ extern struct mutex opp_table_lock;
 
 extern struct list_head opp_tables, lazy_opp_tables;
 
+/* OPP Config flags */
+#define OPP_CONFIG_CLK			BIT(0)
+#define OPP_CONFIG_REGULATOR		BIT(1)
+#define OPP_CONFIG_REGULATOR_HELPER	BIT(2)
+#define OPP_CONFIG_PROP_NAME		BIT(3)
+#define OPP_CONFIG_SUPPORTED_HW		BIT(4)
+#define OPP_CONFIG_GENPD		BIT(5)
+
+/**
+ * struct opp_config_data - data for set config operations
+ * @opp_table: OPP table
+ * @flags: OPP config flags
+ *
+ * This structure stores the OPP config information for each OPP table
+ * configuration by the callers.
+ */
+struct opp_config_data {
+	struct opp_table *opp_table;
+	unsigned int flags;
+};
+
 /*
  * Internal data structure organization with the OPP layer library is as
  * follows:
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 4c490865d574..a08f9481efb3 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -90,6 +90,32 @@ struct dev_pm_set_opp_data {
 	struct device *dev;
 };
 
+/**
+ * struct dev_pm_opp_config - Device OPP configuration values
+ * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
+ * @prop_name: Name to postfix to properties.
+ * @set_opp: Custom set OPP helper.
+ * @supported_hw: Array of hierarchy of versions to match.
+ * @supported_hw_count: Number of elements in the array.
+ * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
+ * @genpd_names: Null terminated array of pointers containing names of genpd to
+ *		 attach.
+ * @virt_devs: Pointer to return the array of virtual devices.
+ *
+ * This structure contains platform specific OPP configurations for the device.
+ */
+struct dev_pm_opp_config {
+	/* NULL terminated */
+	const char * const *clk_names;
+	const char *prop_name;
+	int (*set_opp)(struct dev_pm_set_opp_data *data);
+	const unsigned int *supported_hw;
+	unsigned int supported_hw_count;
+	const char * const *regulator_names;
+	const char * const *genpd_names;
+	struct device ***virt_devs;
+};
+
 #if defined(CONFIG_PM_OPP)
 
 struct opp_table *dev_pm_opp_get_opp_table(struct device *dev);
@@ -154,6 +180,10 @@ int dev_pm_opp_disable(struct device *dev, unsigned long freq);
 int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb);
 int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb);
 
+int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
+int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
+void dev_pm_opp_clear_config(int token);
+
 struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
 int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
@@ -418,6 +448,18 @@ static inline int devm_pm_opp_attach_genpd(struct device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void dev_pm_opp_clear_config(int token) {}
+
 static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
 				struct opp_table *dst_table, struct dev_pm_opp *src_opp)
 {
-- 
cgit v1.2.3


From b0ec09428621daee5101130c307634a390b0213b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 16:36:26 +0530
Subject: OPP: Migrate set-regulators API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-regulators family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c | 12 +++---
 drivers/devfreq/exynos-bus.c | 19 ++++-----
 drivers/opp/core.c           | 91 +++++++++-----------------------------------
 include/linux/pm_opp.h       | 44 +++++++++++++--------
 4 files changed, 60 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index be0c19b3ffa5..d69d13a26414 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -29,9 +29,9 @@ struct private_data {
 
 	cpumask_var_t cpus;
 	struct device *cpu_dev;
-	struct opp_table *opp_table;
 	struct cpufreq_frequency_table *freq_table;
 	bool have_static_opps;
+	int opp_token;
 };
 
 static LIST_HEAD(priv_list);
@@ -220,9 +220,9 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 	 */
 	reg_name[0] = find_supply_name(cpu_dev);
 	if (reg_name[0]) {
-		priv->opp_table = dev_pm_opp_set_regulators(cpu_dev, reg_name);
-		if (IS_ERR(priv->opp_table)) {
-			ret = PTR_ERR(priv->opp_table);
+		priv->opp_token = dev_pm_opp_set_regulators(cpu_dev, reg_name);
+		if (priv->opp_token < 0) {
+			ret = priv->opp_token;
 			if (ret != -EPROBE_DEFER)
 				dev_err(cpu_dev, "failed to set regulators: %d\n",
 					ret);
@@ -294,7 +294,7 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
 out:
 	if (priv->have_static_opps)
 		dev_pm_opp_of_cpumask_remove_table(priv->cpus);
-	dev_pm_opp_put_regulators(priv->opp_table);
+	dev_pm_opp_put_regulators(priv->opp_token);
 free_cpumask:
 	free_cpumask_var(priv->cpus);
 	return ret;
@@ -308,7 +308,7 @@ static void dt_cpufreq_release(void)
 		dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &priv->freq_table);
 		if (priv->have_static_opps)
 			dev_pm_opp_of_cpumask_remove_table(priv->cpus);
-		dev_pm_opp_put_regulators(priv->opp_table);
+		dev_pm_opp_put_regulators(priv->opp_token);
 		free_cpumask_var(priv->cpus);
 		list_del(&priv->node);
 	}
diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index 541baff93ee8..d1235242367f 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -33,7 +33,7 @@ struct exynos_bus {
 
 	unsigned long curr_freq;
 
-	struct opp_table *opp_table;
+	int opp_token;
 	struct clk *clk;
 	unsigned int ratio;
 };
@@ -161,8 +161,7 @@ static void exynos_bus_exit(struct device *dev)
 
 	dev_pm_opp_of_remove_table(dev);
 	clk_disable_unprepare(bus->clk);
-	dev_pm_opp_put_regulators(bus->opp_table);
-	bus->opp_table = NULL;
+	dev_pm_opp_put_regulators(bus->opp_token);
 }
 
 static void exynos_bus_passive_exit(struct device *dev)
@@ -179,18 +178,16 @@ static int exynos_bus_parent_parse_of(struct device_node *np,
 					struct exynos_bus *bus)
 {
 	struct device *dev = bus->dev;
-	struct opp_table *opp_table;
 	const char *supplies[] = { "vdd", NULL };
 	int i, ret, count, size;
 
-	opp_table = dev_pm_opp_set_regulators(dev, supplies);
-	if (IS_ERR(opp_table)) {
-		ret = PTR_ERR(opp_table);
+	ret = dev_pm_opp_set_regulators(dev, supplies);
+	if (ret < 0) {
 		dev_err(dev, "failed to set regulators %d\n", ret);
 		return ret;
 	}
 
-	bus->opp_table = opp_table;
+	bus->opp_token = ret;
 
 	/*
 	 * Get the devfreq-event devices to get the current utilization of
@@ -236,8 +233,7 @@ static int exynos_bus_parent_parse_of(struct device_node *np,
 	return 0;
 
 err_regulator:
-	dev_pm_opp_put_regulators(bus->opp_table);
-	bus->opp_table = NULL;
+	dev_pm_opp_put_regulators(bus->opp_token);
 
 	return ret;
 }
@@ -459,8 +455,7 @@ err:
 	dev_pm_opp_of_remove_table(dev);
 	clk_disable_unprepare(bus->clk);
 err_reg:
-	dev_pm_opp_put_regulators(bus->opp_table);
-	bus->opp_table = NULL;
+	dev_pm_opp_put_regulators(bus->opp_token);
 
 	return ret;
 }
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 7ab20c3b91ed..6ff9b5b69d07 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -991,8 +991,8 @@ static int _set_opp_custom(const struct opp_table *opp_table,
 	int size;
 
 	/*
-	 * We support this only if dev_pm_opp_set_regulators() was called
-	 * earlier.
+	 * We support this only if dev_pm_opp_set_config() was called
+	 * earlier to set regulators.
 	 */
 	if (opp_table->sod_supplies) {
 		size = sizeof(*old_opp->supplies) * opp_table->regulator_count;
@@ -2097,7 +2097,7 @@ void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
 /**
- * dev_pm_opp_set_regulators() - Set regulator names for the device
+ * _opp_set_regulators() - Set regulator names for the device
  * @dev: Device for which regulator name is being set.
  * @names: Array of pointers to the names of the regulator.
  * @count: Number of regulators.
@@ -2108,12 +2108,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
  *
  * This must be called before any OPPs are initialized for the device.
  */
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
-					    const char * const names[])
+static int _opp_set_regulators(struct opp_table *opp_table, struct device *dev,
+			       const char * const names[])
 {
 	struct dev_pm_opp_supply *supplies;
 	const char * const *temp = names;
-	struct opp_table *opp_table;
 	struct regulator *reg;
 	int count = 0, ret, i;
 
@@ -2122,29 +2121,17 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 		count++;
 
 	if (!count)
-		return ERR_PTR(-EINVAL);
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
+		return -EINVAL;
 
 	/* Another CPU that shares the OPP table has set the regulators ? */
 	if (opp_table->regulators)
-		return opp_table;
+		return 0;
 
 	opp_table->regulators = kmalloc_array(count,
 					      sizeof(*opp_table->regulators),
 					      GFP_KERNEL);
-	if (!opp_table->regulators) {
-		ret = -ENOMEM;
-		goto err;
-	}
+	if (!opp_table->regulators)
+		return -ENOMEM;
 
 	for (i = 0; i < count; i++) {
 		reg = regulator_get_optional(dev, names[i]);
@@ -2174,7 +2161,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 	}
 	mutex_unlock(&opp_table->lock);
 
-	return opp_table;
+	return 0;
 
 free_regulators:
 	while (i != 0)
@@ -2183,26 +2170,20 @@ free_regulators:
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = -1;
-err:
-	dev_pm_opp_put_opp_table(opp_table);
 
-	return ERR_PTR(ret);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
 
 /**
- * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
- * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
+ * _opp_put_regulators() - Releases resources blocked for regulator
+ * @opp_table: OPP table returned from _opp_set_regulators().
  */
-void dev_pm_opp_put_regulators(struct opp_table *opp_table)
+static void _opp_put_regulators(struct opp_table *opp_table)
 {
 	int i;
 
-	if (unlikely(!opp_table))
-		return;
-
 	if (!opp_table->regulators)
-		goto put_opp_table;
+		return;
 
 	if (opp_table->enabled) {
 		for (i = opp_table->regulator_count - 1; i >= 0; i--)
@@ -2225,40 +2206,7 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table)
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = -1;
-
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
-
-static void devm_pm_opp_regulators_release(void *data)
-{
-	dev_pm_opp_put_regulators(data);
-}
-
-/**
- * devm_pm_opp_set_regulators() - Set regulator names for the device
- * @dev: Device for which regulator name is being set.
- * @names: Array of pointers to the names of the regulator.
- * @count: Number of regulators.
- *
- * This is a resource-managed variant of dev_pm_opp_set_regulators().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_set_regulators(struct device *dev,
-			       const char * const names[])
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_set_regulators(dev, names);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_regulators_release,
-					opp_table);
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_regulators);
 
 /**
  * dev_pm_opp_set_clkname() - Set clk name for the device
@@ -2633,7 +2581,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_GENPD)
 		dev_pm_opp_detach_genpd(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR)
-		dev_pm_opp_put_regulators(data->opp_table);
+		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		dev_pm_opp_put_supported_hw(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
@@ -2758,11 +2706,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure supplies */
 	if (config->regulator_names) {
-		err = dev_pm_opp_set_regulators(dev, config->regulator_names);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_regulators(opp_table, dev,
+					  config->regulator_names);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_REGULATOR;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index a08f9481efb3..f014bd172c99 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -189,9 +189,6 @@ void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
 int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[]);
-void dev_pm_opp_put_regulators(struct opp_table *opp_table);
-int devm_pm_opp_set_regulators(struct device *dev, const char * const names[]);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -409,19 +406,6 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[])
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_set_regulators(struct device *dev,
-					     const char * const names[])
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -606,4 +590,32 @@ static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_ta
 }
 #endif
 
+/* OPP Configuration helpers */
+
+/* Regulators helpers */
+static inline int dev_pm_opp_set_regulators(struct device *dev,
+					    const char * const names[])
+{
+	struct dev_pm_opp_config config = {
+		.regulator_names = names,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_regulators(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_set_regulators(struct device *dev,
+					     const char * const names[])
+{
+	struct dev_pm_opp_config config = {
+		.regulator_names = names,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 89f03984fa2abface1ffb1fe050b7c175651ffc7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-supported-hw API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-supported-hw family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/imx-cpufreq-dt.c    | 12 ++---
 drivers/cpufreq/tegra20-cpufreq.c   | 12 +++--
 drivers/memory/tegra/tegra124-emc.c | 11 +++--
 drivers/opp/core.c                  | 87 +++++++++----------------------------
 include/linux/pm_opp.h              | 49 +++++++++++++--------
 5 files changed, 66 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/imx-cpufreq-dt.c b/drivers/cpufreq/imx-cpufreq-dt.c
index 3fe9125156b4..76e553af2071 100644
--- a/drivers/cpufreq/imx-cpufreq-dt.c
+++ b/drivers/cpufreq/imx-cpufreq-dt.c
@@ -31,8 +31,8 @@
 
 /* cpufreq-dt device registered by imx-cpufreq-dt */
 static struct platform_device *cpufreq_dt_pdev;
-static struct opp_table *cpufreq_opp_table;
 static struct device *cpu_dev;
+static int cpufreq_opp_token;
 
 enum IMX7ULP_CPUFREQ_CLKS {
 	ARM,
@@ -153,9 +153,9 @@ static int imx_cpufreq_dt_probe(struct platform_device *pdev)
 	dev_info(&pdev->dev, "cpu speed grade %d mkt segment %d supported-hw %#x %#x\n",
 			speed_grade, mkt_segment, supported_hw[0], supported_hw[1]);
 
-	cpufreq_opp_table = dev_pm_opp_set_supported_hw(cpu_dev, supported_hw, 2);
-	if (IS_ERR(cpufreq_opp_table)) {
-		ret = PTR_ERR(cpufreq_opp_table);
+	cpufreq_opp_token = dev_pm_opp_set_supported_hw(cpu_dev, supported_hw, 2);
+	if (cpufreq_opp_token < 0) {
+		ret = cpufreq_opp_token;
 		dev_err(&pdev->dev, "Failed to set supported opp: %d\n", ret);
 		return ret;
 	}
@@ -163,7 +163,7 @@ static int imx_cpufreq_dt_probe(struct platform_device *pdev)
 	cpufreq_dt_pdev = platform_device_register_data(
 			&pdev->dev, "cpufreq-dt", -1, NULL, 0);
 	if (IS_ERR(cpufreq_dt_pdev)) {
-		dev_pm_opp_put_supported_hw(cpufreq_opp_table);
+		dev_pm_opp_put_supported_hw(cpufreq_opp_token);
 		ret = PTR_ERR(cpufreq_dt_pdev);
 		dev_err(&pdev->dev, "Failed to register cpufreq-dt: %d\n", ret);
 		return ret;
@@ -176,7 +176,7 @@ static int imx_cpufreq_dt_remove(struct platform_device *pdev)
 {
 	platform_device_unregister(cpufreq_dt_pdev);
 	if (!of_machine_is_compatible("fsl,imx7ulp"))
-		dev_pm_opp_put_supported_hw(cpufreq_opp_table);
+		dev_pm_opp_put_supported_hw(cpufreq_opp_token);
 	else
 		clk_bulk_put(ARRAY_SIZE(imx7ulp_clks), imx7ulp_clks);
 
diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c
index e8db3d75be25..ab7ac7df9e62 100644
--- a/drivers/cpufreq/tegra20-cpufreq.c
+++ b/drivers/cpufreq/tegra20-cpufreq.c
@@ -32,9 +32,9 @@ static bool cpu0_node_has_opp_v2_prop(void)
 	return ret;
 }
 
-static void tegra20_cpufreq_put_supported_hw(void *opp_table)
+static void tegra20_cpufreq_put_supported_hw(void *opp_token)
 {
-	dev_pm_opp_put_supported_hw(opp_table);
+	dev_pm_opp_put_supported_hw((unsigned long) opp_token);
 }
 
 static void tegra20_cpufreq_dt_unregister(void *cpufreq_dt)
@@ -45,7 +45,6 @@ static void tegra20_cpufreq_dt_unregister(void *cpufreq_dt)
 static int tegra20_cpufreq_probe(struct platform_device *pdev)
 {
 	struct platform_device *cpufreq_dt;
-	struct opp_table *opp_table;
 	struct device *cpu_dev;
 	u32 versions[2];
 	int err;
@@ -71,16 +70,15 @@ static int tegra20_cpufreq_probe(struct platform_device *pdev)
 	if (WARN_ON(!cpu_dev))
 		return -ENODEV;
 
-	opp_table = dev_pm_opp_set_supported_hw(cpu_dev, versions, 2);
-	err = PTR_ERR_OR_ZERO(opp_table);
-	if (err) {
+	err = dev_pm_opp_set_supported_hw(cpu_dev, versions, 2);
+	if (err < 0) {
 		dev_err(&pdev->dev, "failed to set supported hw: %d\n", err);
 		return err;
 	}
 
 	err = devm_add_action_or_reset(&pdev->dev,
 				       tegra20_cpufreq_put_supported_hw,
-				       opp_table);
+				       (void *)((unsigned long) err));
 	if (err)
 		return err;
 
diff --git a/drivers/memory/tegra/tegra124-emc.c b/drivers/memory/tegra/tegra124-emc.c
index 908f8d5392b2..85bc936c02f9 100644
--- a/drivers/memory/tegra/tegra124-emc.c
+++ b/drivers/memory/tegra/tegra124-emc.c
@@ -1395,15 +1395,14 @@ err_msg:
 static int tegra_emc_opp_table_init(struct tegra_emc *emc)
 {
 	u32 hw_version = BIT(tegra_sku_info.soc_speedo_id);
-	struct opp_table *hw_opp_table;
-	int err;
+	int opp_token, err;
 
-	hw_opp_table = dev_pm_opp_set_supported_hw(emc->dev, &hw_version, 1);
-	err = PTR_ERR_OR_ZERO(hw_opp_table);
-	if (err) {
+	err = dev_pm_opp_set_supported_hw(emc->dev, &hw_version, 1);
+	if (err < 0) {
 		dev_err(emc->dev, "failed to set OPP supported HW: %d\n", err);
 		return err;
 	}
+	opp_token = err;
 
 	err = dev_pm_opp_of_add_table(emc->dev);
 	if (err) {
@@ -1430,7 +1429,7 @@ static int tegra_emc_opp_table_init(struct tegra_emc *emc)
 remove_table:
 	dev_pm_opp_of_remove_table(emc->dev);
 put_hw_table:
-	dev_pm_opp_put_supported_hw(hw_opp_table);
+	dev_pm_opp_put_supported_hw(opp_token);
 
 	return err;
 }
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 6ff9b5b69d07..8dbdfff38973 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1952,7 +1952,7 @@ free_opp:
 }
 
 /**
- * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * _opp_set_supported_hw() - Set supported platforms
  * @dev: Device for which supported-hw has to be set.
  * @versions: Array of hierarchy of versions to match.
  * @count: Number of elements in the array.
@@ -1962,84 +1962,39 @@ free_opp:
  * OPPs, which are available for those versions, based on its 'opp-supported-hw'
  * property.
  */
-struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
-			const u32 *versions, unsigned int count)
+static int _opp_set_supported_hw(struct opp_table *opp_table,
+				 const u32 *versions, unsigned int count)
 {
-	struct opp_table *opp_table;
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
 	/* Another CPU that shares the OPP table has set the property ? */
 	if (opp_table->supported_hw)
-		return opp_table;
+		return 0;
 
 	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
 					GFP_KERNEL);
-	if (!opp_table->supported_hw) {
-		dev_pm_opp_put_opp_table(opp_table);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!opp_table->supported_hw)
+		return -ENOMEM;
 
 	opp_table->supported_hw_count = count;
 
-	return opp_table;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
 
 /**
- * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
- * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
+ * _opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @opp_table: OPP table returned by _opp_set_supported_hw().
  *
  * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
+ * _opp_set_supported_hw(). Until this is called, the opp_table structure
  * will not be freed.
  */
-void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
+static void _opp_put_supported_hw(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
-	kfree(opp_table->supported_hw);
-	opp_table->supported_hw = NULL;
-	opp_table->supported_hw_count = 0;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
-
-static void devm_pm_opp_supported_hw_release(void *data)
-{
-	dev_pm_opp_put_supported_hw(data);
-}
-
-/**
- * devm_pm_opp_set_supported_hw() - Set supported platforms
- * @dev: Device for which supported-hw has to be set.
- * @versions: Array of hierarchy of versions to match.
- * @count: Number of elements in the array.
- *
- * This is a resource-managed variant of dev_pm_opp_set_supported_hw().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
-				 unsigned int count)
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_set_supported_hw(dev, versions, count);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_supported_hw_release,
-					opp_table);
+	if (opp_table->supported_hw) {
+		kfree(opp_table->supported_hw);
+		opp_table->supported_hw = NULL;
+		opp_table->supported_hw_count = 0;
+	}
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_supported_hw);
 
 /**
  * dev_pm_opp_set_prop_name() - Set prop-extn name
@@ -2583,7 +2538,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_REGULATOR)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
-		dev_pm_opp_put_supported_hw(data->opp_table);
+		_opp_put_supported_hw(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
 		dev_pm_opp_unregister_set_opp_helper(data->opp_table);
 	if (data->flags & OPP_CONFIG_PROP_NAME)
@@ -2694,12 +2649,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure supported hardware */
 	if (config->supported_hw) {
-		err = dev_pm_opp_set_supported_hw(dev, config->supported_hw,
-						  config->supported_hw_count);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_supported_hw(opp_table, config->supported_hw,
+					    config->supported_hw_count);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_SUPPORTED_HW;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index f014bd172c99..94d0101c254c 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -184,9 +184,6 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 void dev_pm_opp_clear_config(int token);
 
-struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
-void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
-int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -369,22 +366,6 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
-							    const u32 *versions,
-							    unsigned int count)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_set_supported_hw(struct device *dev,
-					       const u32 *versions,
-					       unsigned int count)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 			int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
@@ -618,4 +599,34 @@ static inline int devm_pm_opp_set_regulators(struct device *dev,
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* Supported-hw helpers */
+static inline int dev_pm_opp_set_supported_hw(struct device *dev,
+					      const u32 *versions,
+					      unsigned int count)
+{
+	struct dev_pm_opp_config config = {
+		.supported_hw = versions,
+		.supported_hw_count = count,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_supported_hw(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_set_supported_hw(struct device *dev,
+					       const u32 *versions,
+					       unsigned int count)
+{
+	struct dev_pm_opp_config config = {
+		.supported_hw = versions,
+		.supported_hw_count = count,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 2368f57685768f9f9cd666eaa4194a359d89afb8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-clk-name API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-clk-name family of helpers to use the new
infrastructure.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 142 +++++++++++++++----------------------------------
 include/linux/pm_opp.h |  41 ++++++++------
 2 files changed, 69 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 8dbdfff38973..0a82ca7ae453 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2164,104 +2164,71 @@ static void _opp_put_regulators(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
- *
- * In order to support OPP switching, OPP layer needs to get pointer to the
- * clock for the device. Simple cases work fine without using this routine (i.e.
- * by passing connection-id as NULL), but for a device with multiple clocks
- * available, the OPP core needs to know the exact name of the clk to use.
+ * _opp_set_clknames() - Set clk names for the device
+ * @dev: Device for which clk names is being set.
+ * @names: Clk names.
+ *
+ * In order to support OPP switching, OPP layer needs to get pointers to the
+ * clocks for the device. Simple cases work fine without using this routine
+ * (i.e. by passing connection-id as NULL), but for a device with multiple
+ * clocks available, the OPP core needs to know the exact names of the clks to
+ * use.
  *
  * This must be called before any OPPs are initialized for the device.
  */
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
+static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
+			     const char * const names[])
 {
-	struct opp_table *opp_table;
-	int ret;
+	const char * const *temp = names;
+	int count = 0;
 
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
+	/* Count number of clks */
+	while (*temp++)
+		count++;
 
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
+	/*
+	 * This is a special case where we have a single clock, whose connection
+	 * id name is NULL, i.e. first two entries are NULL in the array.
+	 */
+	if (!count && !names[1])
+		count = 1;
+
+	/* We support only one clock name for now */
+	if (count != 1)
+		return -EINVAL;
 
 	/* Another CPU that shares the OPP table has set the clkname ? */
 	if (opp_table->clk_configured)
-		return opp_table;
+		return 0;
 
 	/* clk shouldn't be initialized at this point */
-	if (WARN_ON(opp_table->clk)) {
-		ret = -EBUSY;
-		goto err;
-	}
+	if (WARN_ON(opp_table->clk))
+		return -EBUSY;
 
 	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, name);
+	opp_table->clk = clk_get(dev, names[0]);
 	if (IS_ERR(opp_table->clk)) {
-		ret = dev_err_probe(dev, PTR_ERR(opp_table->clk),
+		return dev_err_probe(dev, PTR_ERR(opp_table->clk),
 				    "%s: Couldn't find clock\n", __func__);
-		goto err;
 	}
 
 	opp_table->clk_configured = true;
 
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
-
-/**
- * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
- * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
- */
-void dev_pm_opp_put_clkname(struct opp_table *opp_table)
-{
-	if (unlikely(!opp_table))
-		return;
-
-	clk_put(opp_table->clk);
-	opp_table->clk = ERR_PTR(-EINVAL);
-	opp_table->clk_configured = false;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
-
-static void devm_pm_opp_clkname_release(void *data)
-{
-	dev_pm_opp_put_clkname(data);
+	return 0;
 }
 
 /**
- * devm_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
- *
- * This is a resource-managed variant of dev_pm_opp_set_clkname().
- *
- * Return: 0 on success and errorno otherwise.
+ * _opp_put_clknames() - Releases resources blocked for clks.
+ * @opp_table: OPP table returned from _opp_set_clknames().
  */
-int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+static void _opp_put_clknames(struct opp_table *opp_table)
 {
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_set_clkname(dev, name);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_clkname_release,
-					opp_table);
+	if (opp_table->clk_configured) {
+		clk_put(opp_table->clk);
+		opp_table->clk = ERR_PTR(-EINVAL);
+		opp_table->clk_configured = false;
+	}
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_set_clkname);
 
 /**
  * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
@@ -2544,7 +2511,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		dev_pm_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
-		dev_pm_opp_put_clkname(data->opp_table);
+		_opp_put_clknames(data->opp_table);
 
 	dev_pm_opp_put_opp_table(data->opp_table);
 	kfree(data);
@@ -2595,32 +2562,9 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure clocks */
 	if (config->clk_names) {
-		const char * const *temp = config->clk_names;
-		int count = 0;
-
-		/* Count number of clks */
-		while (*temp++)
-			count++;
-
-		/*
-		 * This is a special case where we have a single clock, whose
-		 * connection id name is NULL, i.e. first two entries are NULL
-		 * in the array.
-		 */
-		if (!count && !config->clk_names[1])
-			count = 1;
-
-		/* We support only one clock name for now */
-		if (count != 1) {
-			ret = -EINVAL;
-			goto err;
-		}
-
-		err = dev_pm_opp_set_clkname(dev, config->clk_names[0]);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_clknames(opp_table, dev, config->clk_names);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_CLK;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 94d0101c254c..ed1906bbe8bb 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -186,9 +186,6 @@ void dev_pm_opp_clear_config(int token);
 
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
-void dev_pm_opp_put_clkname(struct opp_table *opp_table);
-int devm_pm_opp_set_clkname(struct device *dev, const char *name);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
@@ -387,18 +384,6 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -629,4 +614,30 @@ static inline int devm_pm_opp_set_supported_hw(struct device *dev,
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* clkname helpers */
+static inline int dev_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	const char *names[] = { name, NULL };
+	struct dev_pm_opp_config config = {
+		.clk_names = names,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_clkname(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	const char *names[] = { name, NULL };
+	struct dev_pm_opp_config config = {
+		.clk_names = names,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 3c543b42a6df35feb3db6689e512fa167739451e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-opp-helper API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-opp-helper family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Remove devm_pm_opp_register_set_opp_helper() as it has no users
currently.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c          | 89 ++++++++++-----------------------------------
 drivers/opp/ti-opp-supply.c |  6 +--
 include/linux/pm_opp.h      | 33 ++++++++---------
 3 files changed, 39 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 0a82ca7ae453..9da7dcf62cab 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2231,7 +2231,7 @@ static void _opp_put_clknames(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * _opp_register_set_opp_helper() - Register custom set OPP helper
  * @dev: Device for which the helper is getting registered.
  * @set_opp: Custom set OPP helper.
  *
@@ -2240,32 +2240,18 @@ static void _opp_put_clknames(struct opp_table *opp_table)
  *
  * This must be called before any OPPs are initialized for the device.
  */
-struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
+static int _opp_register_set_opp_helper(struct opp_table *opp_table,
+	struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
 	struct dev_pm_set_opp_data *data;
-	struct opp_table *opp_table;
-
-	if (!set_opp)
-		return ERR_PTR(-EINVAL);
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		dev_pm_opp_put_opp_table(opp_table);
-		return ERR_PTR(-EBUSY);
-	}
 
 	/* Another CPU that shares the OPP table has set the helper ? */
 	if (opp_table->set_opp)
-		return opp_table;
+		return 0;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	mutex_lock(&opp_table->lock);
 	opp_table->set_opp_data = data;
@@ -2278,60 +2264,26 @@ struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 
 	opp_table->set_opp = set_opp;
 
-	return opp_table;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
 
 /**
- * dev_pm_opp_unregister_set_opp_helper() - Releases resources blocked for
- *					   set_opp helper
- * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
+ * _opp_unregister_set_opp_helper() - Releases resources blocked for set_opp helper
+ * @opp_table: OPP table returned from _opp_register_set_opp_helper().
  *
  * Release resources blocked for platform specific set_opp helper.
  */
-void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table)
+static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
-	opp_table->set_opp = NULL;
-
-	mutex_lock(&opp_table->lock);
-	kfree(opp_table->set_opp_data);
-	opp_table->set_opp_data = NULL;
-	mutex_unlock(&opp_table->lock);
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper);
-
-static void devm_pm_opp_unregister_set_opp_helper(void *data)
-{
-	dev_pm_opp_unregister_set_opp_helper(data);
-}
-
-/**
- * devm_pm_opp_register_set_opp_helper() - Register custom set OPP helper
- * @dev: Device for which the helper is getting registered.
- * @set_opp: Custom set OPP helper.
- *
- * This is a resource-managed version of dev_pm_opp_register_set_opp_helper().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_register_set_opp_helper(struct device *dev,
-					int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_register_set_opp_helper(dev, set_opp);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
+	if (opp_table->set_opp) {
+		opp_table->set_opp = NULL;
 
-	return devm_add_action_or_reset(dev, devm_pm_opp_unregister_set_opp_helper,
-					opp_table);
+		mutex_lock(&opp_table->lock);
+		kfree(opp_table->set_opp_data);
+		opp_table->set_opp_data = NULL;
+		mutex_unlock(&opp_table->lock);
+	}
 }
-EXPORT_SYMBOL_GPL(devm_pm_opp_register_set_opp_helper);
 
 static void _opp_detach_genpd(struct opp_table *opp_table)
 {
@@ -2507,7 +2459,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		_opp_put_supported_hw(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
-		dev_pm_opp_unregister_set_opp_helper(data->opp_table);
+		_opp_unregister_set_opp_helper(data->opp_table);
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		dev_pm_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
@@ -2582,11 +2534,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure opp helper */
 	if (config->set_opp) {
-		err = dev_pm_opp_register_set_opp_helper(dev, config->set_opp);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_register_set_opp_helper(opp_table, dev,
+						   config->set_opp);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
 	}
diff --git a/drivers/opp/ti-opp-supply.c b/drivers/opp/ti-opp-supply.c
index bd4771f388ab..40ebc9ac82dd 100644
--- a/drivers/opp/ti-opp-supply.c
+++ b/drivers/opp/ti-opp-supply.c
@@ -405,9 +405,9 @@ static int ti_opp_supply_probe(struct platform_device *pdev)
 			return ret;
 	}
 
-	ret = PTR_ERR_OR_ZERO(dev_pm_opp_register_set_opp_helper(cpu_dev,
-								 ti_opp_supply_set_opp));
-	if (ret)
+	ret = dev_pm_opp_register_set_opp_helper(cpu_dev,
+						 ti_opp_supply_set_opp);
+	if (ret < 0)
 		_free_optimized_voltages(dev, &opp_data);
 
 	return ret;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index ed1906bbe8bb..85a4b7353979 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -186,9 +186,6 @@ void dev_pm_opp_clear_config(int token);
 
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
-void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
-int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
@@ -363,20 +360,6 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_register_set_opp_helper(struct device *dev,
-				    int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	return -EOPNOTSUPP;
-}
-
 static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -640,4 +623,20 @@ static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* set-opp helpers */
+static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
+			int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	struct dev_pm_opp_config config = {
+		.set_opp = set_opp,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_unregister_set_opp_helper(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 442e7a1786e628b38175314ec1805966b8d0c02c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate attach-genpd API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the attach-genpd family of helpers to use the new
infrastructure.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 85 +++++++++++---------------------------------------
 include/linux/pm_opp.h | 48 ++++++++++++++++++----------
 2 files changed, 49 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 9da7dcf62cab..458584994c2b 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2285,7 +2285,7 @@ static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
 	}
 }
 
-static void _opp_detach_genpd(struct opp_table *opp_table)
+static void _detach_genpd(struct opp_table *opp_table)
 {
 	int index;
 
@@ -2305,7 +2305,7 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual device pointer
+ * _opp_attach_genpd - Attach genpd(s) for the device and save virtual device pointer
  * @dev: Consumer device for which the genpd is getting attached.
  * @names: Null terminated array of pointers containing names of genpd to attach.
  * @virt_devs: Pointer to return the array of virtual devices.
@@ -2326,30 +2326,23 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
  * The order of entries in the names array must match the order in which
  * "required-opps" are added in DT.
  */
-struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
-		const char * const *names, struct device ***virt_devs)
+static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
+			const char * const *names, struct device ***virt_devs)
 {
-	struct opp_table *opp_table;
 	struct device *virt_dev;
 	int index = 0, ret = -EINVAL;
 	const char * const *name = names;
 
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
 	if (opp_table->genpd_virt_devs)
-		return opp_table;
+		return 0;
 
 	/*
 	 * If the genpd's OPP table isn't already initialized, parsing of the
 	 * required-opps fail for dev. We should retry this after genpd's OPP
 	 * table is added.
 	 */
-	if (!opp_table->required_opp_count) {
-		ret = -EPROBE_DEFER;
-		goto put_table;
-	}
+	if (!opp_table->required_opp_count)
+		return -EPROBE_DEFER;
 
 	mutex_lock(&opp_table->genpd_virt_dev_lock);
 
@@ -2382,78 +2375,38 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 		*virt_devs = opp_table->genpd_virt_devs;
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
 
-	return opp_table;
+	return 0;
 
 err:
-	_opp_detach_genpd(opp_table);
+	_detach_genpd(opp_table);
 unlock:
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
+	return ret;
 
-put_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_attach_genpd);
 
 /**
- * dev_pm_opp_detach_genpd() - Detach genpd(s) from the device.
- * @opp_table: OPP table returned by dev_pm_opp_attach_genpd().
+ * _opp_detach_genpd() - Detach genpd(s) from the device.
+ * @opp_table: OPP table returned by _opp_attach_genpd().
  *
  * This detaches the genpd(s), resets the virtual device pointers, and puts the
  * OPP table.
  */
-void dev_pm_opp_detach_genpd(struct opp_table *opp_table)
+static void _opp_detach_genpd(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
 	/*
 	 * Acquire genpd_virt_dev_lock to make sure virt_dev isn't getting
 	 * used in parallel.
 	 */
 	mutex_lock(&opp_table->genpd_virt_dev_lock);
-	_opp_detach_genpd(opp_table);
+	_detach_genpd(opp_table);
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
-
-	dev_pm_opp_put_opp_table(opp_table);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_detach_genpd);
-
-static void devm_pm_opp_detach_genpd(void *data)
-{
-	dev_pm_opp_detach_genpd(data);
-}
-
-/**
- * devm_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual
- *			      device pointer
- * @dev: Consumer device for which the genpd is getting attached.
- * @names: Null terminated array of pointers containing names of genpd to attach.
- * @virt_devs: Pointer to return the array of virtual devices.
- *
- * This is a resource-managed version of dev_pm_opp_attach_genpd().
- *
- * Return: 0 on success and errorno otherwise.
- */
-int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names,
-			     struct device ***virt_devs)
-{
-	struct opp_table *opp_table;
-
-	opp_table = dev_pm_opp_attach_genpd(dev, names, virt_devs);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	return devm_add_action_or_reset(dev, devm_pm_opp_detach_genpd,
-					opp_table);
-}
-EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
 
 static void _opp_clear_config(struct opp_config_data *data)
 {
 	if (data->flags & OPP_CONFIG_GENPD)
-		dev_pm_opp_detach_genpd(data->opp_table);
+		_opp_detach_genpd(data->opp_table);
 	if (data->flags & OPP_CONFIG_REGULATOR)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
@@ -2564,12 +2517,10 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Attach genpds */
 	if (config->genpd_names) {
-		err = dev_pm_opp_attach_genpd(dev, config->genpd_names,
-					      config->virt_devs);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_attach_genpd(opp_table, dev, config->genpd_names,
+					config->virt_devs);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_GENPD;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 85a4b7353979..20e1e5060a8a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -186,9 +186,6 @@ void dev_pm_opp_clear_config(int token);
 
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
-struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
-void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
-int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs);
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
@@ -367,20 +364,6 @@ static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, con
 
 static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
 
-static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
-
-static inline int devm_pm_opp_attach_genpd(struct device *dev,
-					   const char * const *names,
-					   struct device ***virt_devs)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 {
 	return -EOPNOTSUPP;
@@ -639,4 +622,35 @@ static inline void dev_pm_opp_unregister_set_opp_helper(int token)
 	dev_pm_opp_clear_config(token);
 }
 
+/* genpd helpers */
+static inline int dev_pm_opp_attach_genpd(struct device *dev,
+					  const char * const *names,
+					  struct device ***virt_devs)
+{
+	struct dev_pm_opp_config config = {
+		.genpd_names = names,
+		.virt_devs = virt_devs,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_detach_genpd(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
+static inline int devm_pm_opp_attach_genpd(struct device *dev,
+					   const char * const *names,
+					   struct device ***virt_devs)
+{
+	struct dev_pm_opp_config config = {
+		.genpd_names = names,
+		.virt_devs = virt_devs,
+	};
+
+	return devm_pm_opp_set_config(dev, &config);
+}
+
+
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From 298098e55a6fcc176a5af52cd689f33577ead5ca Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 26 May 2022 09:36:27 +0530
Subject: OPP: Migrate set-prop-name helper API to use set-config helpers

Now that we have a central API to handle all OPP table configurations,
migrate the set-prop-name family of helpers to use the new
infrastructure.

The return type and parameter to the APIs change a bit due to this,
update the current users as well in the same commit in order to avoid
breaking builds.

Acked-by: Samuel Holland <samuel@sholland.org> # sun50i
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/sun50i-cpufreq-nvmem.c | 31 +++++++++----------
 drivers/opp/core.c                     | 55 +++++++++++-----------------------
 include/linux/pm_opp.h                 | 23 ++++++++------
 3 files changed, 46 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
index 75e1bf3a08f7..a4922580ce06 100644
--- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c
+++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
@@ -86,20 +86,20 @@ static int sun50i_cpufreq_get_efuse(u32 *versions)
 
 static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 {
-	struct opp_table **opp_tables;
+	int *opp_tokens;
 	char name[MAX_NAME_LEN];
 	unsigned int cpu;
 	u32 speed = 0;
 	int ret;
 
-	opp_tables = kcalloc(num_possible_cpus(), sizeof(*opp_tables),
+	opp_tokens = kcalloc(num_possible_cpus(), sizeof(*opp_tokens),
 			     GFP_KERNEL);
-	if (!opp_tables)
+	if (!opp_tokens)
 		return -ENOMEM;
 
 	ret = sun50i_cpufreq_get_efuse(&speed);
 	if (ret) {
-		kfree(opp_tables);
+		kfree(opp_tokens);
 		return ret;
 	}
 
@@ -113,9 +113,9 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 			goto free_opp;
 		}
 
-		opp_tables[cpu] = dev_pm_opp_set_prop_name(cpu_dev, name);
-		if (IS_ERR(opp_tables[cpu])) {
-			ret = PTR_ERR(opp_tables[cpu]);
+		opp_tokens[cpu] = dev_pm_opp_set_prop_name(cpu_dev, name);
+		if (opp_tokens[cpu] < 0) {
+			ret = opp_tokens[cpu];
 			pr_err("Failed to set prop name\n");
 			goto free_opp;
 		}
@@ -124,7 +124,7 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 	cpufreq_dt_pdev = platform_device_register_simple("cpufreq-dt", -1,
 							  NULL, 0);
 	if (!IS_ERR(cpufreq_dt_pdev)) {
-		platform_set_drvdata(pdev, opp_tables);
+		platform_set_drvdata(pdev, opp_tokens);
 		return 0;
 	}
 
@@ -132,27 +132,24 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev)
 	pr_err("Failed to register platform device\n");
 
 free_opp:
-	for_each_possible_cpu(cpu) {
-		if (IS_ERR_OR_NULL(opp_tables[cpu]))
-			break;
-		dev_pm_opp_put_prop_name(opp_tables[cpu]);
-	}
-	kfree(opp_tables);
+	for_each_possible_cpu(cpu)
+		dev_pm_opp_put_prop_name(opp_tokens[cpu]);
+	kfree(opp_tokens);
 
 	return ret;
 }
 
 static int sun50i_cpufreq_nvmem_remove(struct platform_device *pdev)
 {
-	struct opp_table **opp_tables = platform_get_drvdata(pdev);
+	int *opp_tokens = platform_get_drvdata(pdev);
 	unsigned int cpu;
 
 	platform_device_unregister(cpufreq_dt_pdev);
 
 	for_each_possible_cpu(cpu)
-		dev_pm_opp_put_prop_name(opp_tables[cpu]);
+		dev_pm_opp_put_prop_name(opp_tokens[cpu]);
 
-	kfree(opp_tables);
+	kfree(opp_tokens);
 
 	return 0;
 }
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 458584994c2b..1745e25c1eaf 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1997,7 +1997,7 @@ static void _opp_put_supported_hw(struct opp_table *opp_table)
 }
 
 /**
- * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * _opp_set_prop_name() - Set prop-extn name
  * @dev: Device for which the prop-name has to be set.
  * @name: name to postfix to properties.
  *
@@ -2006,50 +2006,33 @@ static void _opp_put_supported_hw(struct opp_table *opp_table)
  * which the extension will apply are opp-microvolt and opp-microamp. OPP core
  * should postfix the property name with -<name> while looking for them.
  */
-struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+static int _opp_set_prop_name(struct opp_table *opp_table, const char *name)
 {
-	struct opp_table *opp_table;
-
-	opp_table = _add_opp_table(dev, false);
-	if (IS_ERR(opp_table))
-		return opp_table;
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
 	/* Another CPU that shares the OPP table has set the property ? */
-	if (opp_table->prop_name)
-		return opp_table;
-
-	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
 	if (!opp_table->prop_name) {
-		dev_pm_opp_put_opp_table(opp_table);
-		return ERR_PTR(-ENOMEM);
+		opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+		if (!opp_table->prop_name)
+			return -ENOMEM;
 	}
 
-	return opp_table;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
 
 /**
- * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
- * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
+ * _opp_put_prop_name() - Releases resources blocked for prop-name
+ * @opp_table: OPP table returned by _opp_set_prop_name().
  *
  * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
+ * _opp_set_prop_name(). Until this is called, the opp_table structure
  * will not be freed.
  */
-void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
+static void _opp_put_prop_name(struct opp_table *opp_table)
 {
-	if (unlikely(!opp_table))
-		return;
-
-	kfree(opp_table->prop_name);
-	opp_table->prop_name = NULL;
-
-	dev_pm_opp_put_opp_table(opp_table);
+	if (opp_table->prop_name) {
+		kfree(opp_table->prop_name);
+		opp_table->prop_name = NULL;
+	}
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
 /**
  * _opp_set_regulators() - Set regulator names for the device
@@ -2414,7 +2397,7 @@ static void _opp_clear_config(struct opp_config_data *data)
 	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
 		_opp_unregister_set_opp_helper(data->opp_table);
 	if (data->flags & OPP_CONFIG_PROP_NAME)
-		dev_pm_opp_put_prop_name(data->opp_table);
+		_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
 		_opp_put_clknames(data->opp_table);
 
@@ -2441,7 +2424,7 @@ static void _opp_clear_config(struct opp_config_data *data)
  */
 int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 {
-	struct opp_table *opp_table, *err;
+	struct opp_table *opp_table;
 	struct opp_config_data *data;
 	unsigned int id;
 	int ret;
@@ -2476,11 +2459,9 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure property names */
 	if (config->prop_name) {
-		err = dev_pm_opp_set_prop_name(dev, config->prop_name);
-		if (IS_ERR(err)) {
-			ret = PTR_ERR(err);
+		ret = _opp_set_prop_name(opp_table, config->prop_name);
+		if (ret)
 			goto err;
-		}
 
 		data->flags |= OPP_CONFIG_PROP_NAME;
 	}
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 20e1e5060a8a..f995ca1406e8 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -184,8 +184,6 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 void dev_pm_opp_clear_config(int token);
 
-struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
-void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
@@ -357,13 +355,6 @@ static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct noti
 	return -EOPNOTSUPP;
 }
 
-static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {}
-
 static inline int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 {
 	return -EOPNOTSUPP;
@@ -652,5 +643,19 @@ static inline int devm_pm_opp_attach_genpd(struct device *dev,
 	return devm_pm_opp_set_config(dev, &config);
 }
 
+/* prop-name helpers */
+static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	struct dev_pm_opp_config config = {
+		.prop_name = name,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_prop_name(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
 
 #endif		/* __LINUX_OPP_H__ */
-- 
cgit v1.2.3


From aee3352f6ecf8cfad1f1ee5838cfc4d37c6b8f75 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 13:45:08 +0530
Subject: OPP: Add support for config_regulators() helper

Extend the dev_pm_opp_set_config() interface to allow adding
config_regulators() helpers. This helper will be called to set the
voltages of the regulators from the regular path in _set_opp(), while we
are trying to change the OPP.

This will eventually replace the custom set_opp() helper.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/opp/opp.h      |  2 ++
 include/linux/pm_opp.h | 22 ++++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 1745e25c1eaf..12bae79564f1 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1185,6 +1185,17 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 			dev_err(dev, "Failed to set bw: %d\n", ret);
 			return ret;
 		}
+
+		if (opp_table->config_regulators) {
+			ret = opp_table->config_regulators(dev, old_opp, opp,
+							   opp_table->regulators,
+							   opp_table->regulator_count);
+			if (ret) {
+				dev_err(dev, "Failed to set regulator voltages: %d\n",
+					ret);
+				return ret;
+			}
+		}
 	}
 
 	if (opp_table->set_opp) {
@@ -1202,6 +1213,17 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 
 	/* Scaling down? Configure required OPPs after frequency */
 	if (scaling_down) {
+		if (opp_table->config_regulators) {
+			ret = opp_table->config_regulators(dev, old_opp, opp,
+							   opp_table->regulators,
+							   opp_table->regulator_count);
+			if (ret) {
+				dev_err(dev, "Failed to set regulator voltages: %d\n",
+					ret);
+				return ret;
+			}
+		}
+
 		ret = _set_opp_bw(opp_table, opp, dev);
 		if (ret) {
 			dev_err(dev, "Failed to set bw: %d\n", ret);
@@ -2268,6 +2290,38 @@ static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
 	}
 }
 
+/**
+ * _opp_set_config_regulators_helper() - Register custom set regulator helper.
+ * @dev: Device for which the helper is getting registered.
+ * @config_regulators: Custom set regulator helper.
+ *
+ * This is useful to support platforms with multiple regulators per device.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+static int _opp_set_config_regulators_helper(struct opp_table *opp_table,
+		struct device *dev, config_regulators_t config_regulators)
+{
+	/* Another CPU that shares the OPP table has set the helper ? */
+	if (!opp_table->config_regulators)
+		opp_table->config_regulators = config_regulators;
+
+	return 0;
+}
+
+/**
+ * _opp_put_config_regulators_helper() - Releases resources blocked for
+ *					 config_regulators helper.
+ * @opp_table: OPP table returned from _opp_set_config_regulators_helper().
+ *
+ * Release resources blocked for platform specific config_regulators helper.
+ */
+static void _opp_put_config_regulators_helper(struct opp_table *opp_table)
+{
+	if (opp_table->config_regulators)
+		opp_table->config_regulators = NULL;
+}
+
 static void _detach_genpd(struct opp_table *opp_table)
 {
 	int index;
@@ -2394,8 +2448,10 @@ static void _opp_clear_config(struct opp_config_data *data)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		_opp_put_supported_hw(data->opp_table);
-	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
+	if (data->flags & OPP_CONFIG_REGULATOR_HELPER) {
+		_opp_put_config_regulators_helper(data->opp_table);
 		_opp_unregister_set_opp_helper(data->opp_table);
+	}
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
@@ -2476,6 +2532,16 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
 	}
 
+	/* Configure config_regulators helper */
+	if (config->config_regulators) {
+		ret = _opp_set_config_regulators_helper(opp_table, dev,
+						config->config_regulators);
+		if (ret)
+			goto err;
+
+		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
+	}
+
 	/* Configure supported hardware */
 	if (config->supported_hw) {
 		ret = _opp_set_supported_hw(opp_table, config->supported_hw,
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index d652f0cc84f1..45fd40737159 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -172,6 +172,7 @@ enum opp_table_access {
  * @prop_name: A name to postfix to many DT properties, while parsing them.
  * @clk_configured: Clock name is configured by the platform.
  * @clk: Device's clock handle
+ * @config_regulators: Platform specific config_regulators() callback.
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators. Its value can be -1
  * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt
@@ -224,6 +225,7 @@ struct opp_table {
 	const char *prop_name;
 	bool clk_configured;
 	struct clk *clk;
+	config_regulators_t config_regulators;
 	struct regulator **regulators;
 	int regulator_count;
 	struct icc_path **paths;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index f995ca1406e8..9f2f9a792a19 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -90,11 +90,16 @@ struct dev_pm_set_opp_data {
 	struct device *dev;
 };
 
+typedef int (*config_regulators_t)(struct device *dev,
+			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
+			struct regulator **regulators, unsigned int count);
+
 /**
  * struct dev_pm_opp_config - Device OPP configuration values
  * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
  * @prop_name: Name to postfix to properties.
  * @set_opp: Custom set OPP helper.
+ * @config_regulators: Custom set regulator helper.
  * @supported_hw: Array of hierarchy of versions to match.
  * @supported_hw_count: Number of elements in the array.
  * @regulator_names: Array of pointers to the names of the regulator, NULL terminated.
@@ -109,6 +114,7 @@ struct dev_pm_opp_config {
 	const char * const *clk_names;
 	const char *prop_name;
 	int (*set_opp)(struct dev_pm_set_opp_data *data);
+	config_regulators_t config_regulators;
 	const unsigned int *supported_hw;
 	unsigned int supported_hw_count;
 	const char * const *regulator_names;
@@ -613,6 +619,22 @@ static inline void dev_pm_opp_unregister_set_opp_helper(int token)
 	dev_pm_opp_clear_config(token);
 }
 
+/* config-regulators helpers */
+static inline int dev_pm_opp_set_config_regulators(struct device *dev,
+						   config_regulators_t helper)
+{
+	struct dev_pm_opp_config config = {
+		.config_regulators = helper,
+	};
+
+	return dev_pm_opp_set_config(dev, &config);
+}
+
+static inline void dev_pm_opp_put_config_regulators(int token)
+{
+	dev_pm_opp_clear_config(token);
+}
+
 /* genpd helpers */
 static inline int dev_pm_opp_attach_genpd(struct device *dev,
 					  const char * const *names,
-- 
cgit v1.2.3


From 69b1af178a3a534da2f20fa0c7356fcbbe8cce1f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 31 May 2022 13:24:21 +0530
Subject: OPP: Add dev_pm_opp_get_supplies()

We already have an API for getting voltage information for a single
regulator, dev_pm_opp_get_voltage(), but there is nothing available for
multiple regulator case.

This patch adds a new API, dev_pm_opp_get_supplies(), to get all
information related to the supplies for an OPP.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 25 +++++++++++++++++++++++++
 include/linux/pm_opp.h |  7 +++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 5ecf077b6b17..3a794bff2ba1 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -117,6 +117,31 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
 
+/**
+ * dev_pm_opp_get_supplies() - Gets the supply information corresponding to an opp
+ * @opp:	opp for which voltage has to be returned for
+ * @supplies:	Placeholder for copying the supply information.
+ *
+ * Return: negative error number on failure, 0 otherwise on success after
+ * setting @supplies.
+ *
+ * This can be used for devices with any number of power supplies. The caller
+ * must ensure the @supplies array must contain space for each regulator.
+ */
+int dev_pm_opp_get_supplies(struct dev_pm_opp *opp,
+			    struct dev_pm_opp_supply *supplies)
+{
+	if (IS_ERR_OR_NULL(opp) || !supplies) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return -EINVAL;
+	}
+
+	memcpy(supplies, opp->supplies,
+	       sizeof(*supplies) * opp->opp_table->regulator_count);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_supplies);
+
 /**
  * dev_pm_opp_get_power() - Gets the power corresponding to an opp
  * @opp:	opp for which power has to be returned for
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9f2f9a792a19..1e2b33d79ba6 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -129,6 +129,8 @@ void dev_pm_opp_put_opp_table(struct opp_table *opp_table);
 
 unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
 
+int dev_pm_opp_get_supplies(struct dev_pm_opp *opp, struct dev_pm_opp_supply *supplies);
+
 unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp);
 
 unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
@@ -217,6 +219,11 @@ static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 	return 0;
 }
 
+static inline int dev_pm_opp_get_supplies(struct dev_pm_opp *opp, struct dev_pm_opp_supply *supplies)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp)
 {
 	return 0;
-- 
cgit v1.2.3


From 1f378c6ead5c69decf36e8e61de2e50377c76ab8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 4 Jul 2022 14:57:06 +0530
Subject: OPP: Remove custom OPP helper support

The only user of the custom helper is migrated to use
dev_pm_opp_set_config_regulators() interface. Remove the now unused
custom OPP helper support.

This cleans up _set_opp() and leaves a single code path to be used by
all users.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 132 +------------------------------------------------
 drivers/opp/opp.h      |   7 ---
 include/linux/pm_opp.h |  51 -------------------
 3 files changed, 2 insertions(+), 188 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 3a794bff2ba1..e74bdc134c5a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -979,36 +979,6 @@ static int _set_opp_bw(const struct opp_table *opp_table,
 	return 0;
 }
 
-static int _set_opp_custom(const struct opp_table *opp_table,
-			   struct device *dev, struct dev_pm_opp *opp,
-			   unsigned long freq)
-{
-	struct dev_pm_set_opp_data *data = opp_table->set_opp_data;
-	struct dev_pm_opp *old_opp = opp_table->current_opp;
-	int size;
-
-	/*
-	 * We support this only if dev_pm_opp_set_config() was called
-	 * earlier to set regulators.
-	 */
-	if (opp_table->sod_supplies) {
-		size = sizeof(*old_opp->supplies) * opp_table->regulator_count;
-		memcpy(data->old_opp.supplies, old_opp->supplies, size);
-		memcpy(data->new_opp.supplies, opp->supplies, size);
-		data->regulator_count = opp_table->regulator_count;
-	} else {
-		data->regulator_count = 0;
-	}
-
-	data->regulators = opp_table->regulators;
-	data->clk = opp_table->clk;
-	data->dev = dev;
-	data->old_opp.rate = old_opp->rate;
-	data->new_opp.rate = freq;
-
-	return opp_table->set_opp(data);
-}
-
 static int _set_required_opp(struct device *dev, struct device *pd_dev,
 			     struct dev_pm_opp *opp, int i)
 {
@@ -1195,13 +1165,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 		}
 	}
 
-	if (opp_table->set_opp) {
-		ret = _set_opp_custom(opp_table, dev, opp, freq);
-	} else {
-		/* Only frequency scaling */
-		ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
-	}
-
+	ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq);
 	if (ret)
 		return ret;
 
@@ -2065,7 +2029,6 @@ static void _opp_put_prop_name(struct opp_table *opp_table)
 static int _opp_set_regulators(struct opp_table *opp_table, struct device *dev,
 			       const char * const names[])
 {
-	struct dev_pm_opp_supply *supplies;
 	const char * const *temp = names;
 	struct regulator *reg;
 	int count = 0, ret, i;
@@ -2101,20 +2064,6 @@ static int _opp_set_regulators(struct opp_table *opp_table, struct device *dev,
 
 	opp_table->regulator_count = count;
 
-	supplies = kmalloc_array(count * 2, sizeof(*supplies), GFP_KERNEL);
-	if (!supplies) {
-		ret = -ENOMEM;
-		goto free_regulators;
-	}
-
-	mutex_lock(&opp_table->lock);
-	opp_table->sod_supplies = supplies;
-	if (opp_table->set_opp_data) {
-		opp_table->set_opp_data->old_opp.supplies = supplies;
-		opp_table->set_opp_data->new_opp.supplies = supplies + count;
-	}
-	mutex_unlock(&opp_table->lock);
-
 	/* Set generic config_regulators() for single regulators here */
 	if (count == 1)
 		opp_table->config_regulators = _opp_config_regulator_single;
@@ -2151,16 +2100,6 @@ static void _opp_put_regulators(struct opp_table *opp_table)
 	for (i = opp_table->regulator_count - 1; i >= 0; i--)
 		regulator_put(opp_table->regulators[i]);
 
-	mutex_lock(&opp_table->lock);
-	if (opp_table->set_opp_data) {
-		opp_table->set_opp_data->old_opp.supplies = NULL;
-		opp_table->set_opp_data->new_opp.supplies = NULL;
-	}
-
-	kfree(opp_table->sod_supplies);
-	opp_table->sod_supplies = NULL;
-	mutex_unlock(&opp_table->lock);
-
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = -1;
@@ -2233,61 +2172,6 @@ static void _opp_put_clknames(struct opp_table *opp_table)
 	}
 }
 
-/**
- * _opp_register_set_opp_helper() - Register custom set OPP helper
- * @dev: Device for which the helper is getting registered.
- * @set_opp: Custom set OPP helper.
- *
- * This is useful to support complex platforms (like platforms with multiple
- * regulators per device), instead of the generic OPP set rate helper.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-static int _opp_register_set_opp_helper(struct opp_table *opp_table,
-	struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct dev_pm_set_opp_data *data;
-
-	/* Another CPU that shares the OPP table has set the helper ? */
-	if (opp_table->set_opp)
-		return 0;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	mutex_lock(&opp_table->lock);
-	opp_table->set_opp_data = data;
-	if (opp_table->sod_supplies) {
-		data->old_opp.supplies = opp_table->sod_supplies;
-		data->new_opp.supplies = opp_table->sod_supplies +
-					 opp_table->regulator_count;
-	}
-	mutex_unlock(&opp_table->lock);
-
-	opp_table->set_opp = set_opp;
-
-	return 0;
-}
-
-/**
- * _opp_unregister_set_opp_helper() - Releases resources blocked for set_opp helper
- * @opp_table: OPP table returned from _opp_register_set_opp_helper().
- *
- * Release resources blocked for platform specific set_opp helper.
- */
-static void _opp_unregister_set_opp_helper(struct opp_table *opp_table)
-{
-	if (opp_table->set_opp) {
-		opp_table->set_opp = NULL;
-
-		mutex_lock(&opp_table->lock);
-		kfree(opp_table->set_opp_data);
-		opp_table->set_opp_data = NULL;
-		mutex_unlock(&opp_table->lock);
-	}
-}
-
 /**
  * _opp_set_config_regulators_helper() - Register custom set regulator helper.
  * @dev: Device for which the helper is getting registered.
@@ -2446,10 +2330,8 @@ static void _opp_clear_config(struct opp_config_data *data)
 		_opp_put_regulators(data->opp_table);
 	if (data->flags & OPP_CONFIG_SUPPORTED_HW)
 		_opp_put_supported_hw(data->opp_table);
-	if (data->flags & OPP_CONFIG_REGULATOR_HELPER) {
+	if (data->flags & OPP_CONFIG_REGULATOR_HELPER)
 		_opp_put_config_regulators_helper(data->opp_table);
-		_opp_unregister_set_opp_helper(data->opp_table);
-	}
 	if (data->flags & OPP_CONFIG_PROP_NAME)
 		_opp_put_prop_name(data->opp_table);
 	if (data->flags & OPP_CONFIG_CLK)
@@ -2520,16 +2402,6 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 		data->flags |= OPP_CONFIG_PROP_NAME;
 	}
 
-	/* Configure opp helper */
-	if (config->set_opp) {
-		ret = _opp_register_set_opp_helper(opp_table, dev,
-						   config->set_opp);
-		if (ret)
-			goto err;
-
-		data->flags |= OPP_CONFIG_REGULATOR_HELPER;
-	}
-
 	/* Configure config_regulators helper */
 	if (config->config_regulators) {
 		ret = _opp_set_config_regulators_helper(opp_table, dev,
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 45fd40737159..13abe991e811 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -182,9 +182,6 @@ enum opp_table_access {
  * @enabled: Set to true if the device's resources are enabled/configured.
  * @genpd_performance_state: Device's power domain support performance state.
  * @is_genpd: Marks if the OPP table belongs to a genpd.
- * @set_opp: Platform specific set_opp callback
- * @sod_supplies: Set opp data supplies
- * @set_opp_data: Data to be passed to set_opp callback
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -234,10 +231,6 @@ struct opp_table {
 	bool genpd_performance_state;
 	bool is_genpd;
 
-	int (*set_opp)(struct dev_pm_set_opp_data *data);
-	struct dev_pm_opp_supply *sod_supplies;
-	struct dev_pm_set_opp_data *set_opp_data;
-
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
 	char dentry_name[NAME_MAX];
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 1e2b33d79ba6..9d59aedc2be3 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -57,39 +57,6 @@ struct dev_pm_opp_icc_bw {
 	u32 peak;
 };
 
-/**
- * struct dev_pm_opp_info - OPP freq/voltage/current values
- * @rate:	Target clk rate in hz
- * @supplies:	Array of voltage/current values for all power supplies
- *
- * This structure stores the freq/voltage/current values for a single OPP.
- */
-struct dev_pm_opp_info {
-	unsigned long rate;
-	struct dev_pm_opp_supply *supplies;
-};
-
-/**
- * struct dev_pm_set_opp_data - Set OPP data
- * @old_opp:	Old OPP info
- * @new_opp:	New OPP info
- * @regulators:	Array of regulator pointers
- * @regulator_count: Number of regulators
- * @clk:	Pointer to clk
- * @dev:	Pointer to the struct device
- *
- * This structure contains all information required for setting an OPP.
- */
-struct dev_pm_set_opp_data {
-	struct dev_pm_opp_info old_opp;
-	struct dev_pm_opp_info new_opp;
-
-	struct regulator **regulators;
-	unsigned int regulator_count;
-	struct clk *clk;
-	struct device *dev;
-};
-
 typedef int (*config_regulators_t)(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count);
@@ -98,7 +65,6 @@ typedef int (*config_regulators_t)(struct device *dev,
  * struct dev_pm_opp_config - Device OPP configuration values
  * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
  * @prop_name: Name to postfix to properties.
- * @set_opp: Custom set OPP helper.
  * @config_regulators: Custom set regulator helper.
  * @supported_hw: Array of hierarchy of versions to match.
  * @supported_hw_count: Number of elements in the array.
@@ -113,7 +79,6 @@ struct dev_pm_opp_config {
 	/* NULL terminated */
 	const char * const *clk_names;
 	const char *prop_name;
-	int (*set_opp)(struct dev_pm_set_opp_data *data);
 	config_regulators_t config_regulators;
 	const unsigned int *supported_hw;
 	unsigned int supported_hw_count;
@@ -610,22 +575,6 @@ static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
 	return devm_pm_opp_set_config(dev, &config);
 }
 
-/* set-opp helpers */
-static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct dev_pm_opp_config config = {
-		.set_opp = set_opp,
-	};
-
-	return dev_pm_opp_set_config(dev, &config);
-}
-
-static inline void dev_pm_opp_unregister_set_opp_helper(int token)
-{
-	dev_pm_opp_clear_config(token);
-}
-
 /* config-regulators helpers */
 static inline int dev_pm_opp_set_config_regulators(struct device *dev,
 						   config_regulators_t helper)
-- 
cgit v1.2.3


From 9fbb62605607d8f00c32a4765cd1ae67f022b640 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 2 Jun 2022 14:05:59 +0530
Subject: OPP: Remove dev_pm_opp_find_freq_ceil_by_volt()

This was added few years back, but the code that was supposed to use it
never got merged. Remove the unused helper.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 54 --------------------------------------------------
 include/linux/pm_opp.h |  8 --------
 2 files changed, 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e74bdc134c5a..fc0232f695c6 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -607,60 +607,6 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
 
-/**
- * dev_pm_opp_find_freq_ceil_by_volt() - Find OPP with highest frequency for
- *					 target voltage.
- * @dev:	Device for which we do this operation.
- * @u_volt:	Target voltage.
- *
- * Search for OPP with highest (ceil) frequency and has voltage <= u_volt.
- *
- * Return: matching *opp, else returns ERR_PTR in case of error which should be
- * handled using IS_ERR.
- *
- * Error return values can be:
- * EINVAL:	bad parameters
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
-						     unsigned long u_volt)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	if (!dev || !u_volt) {
-		dev_err(dev, "%s: Invalid argument volt=%lu\n", __func__,
-			u_volt);
-		return ERR_PTR(-EINVAL);
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table);
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available) {
-			if (temp_opp->supplies[0].u_volt > u_volt)
-				break;
-			opp = temp_opp;
-		}
-	}
-
-	/* Increment the reference count of OPP */
-	if (!IS_ERR(opp))
-		dev_pm_opp_get(opp);
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil_by_volt);
-
 /**
  * dev_pm_opp_find_level_exact() - search for an exact level
  * @dev:		device for which we do this operation
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9d59aedc2be3..50cbc75bef71 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -118,8 +118,6 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
 					      bool available);
 struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 					      unsigned long *freq);
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
-						     unsigned long u_volt);
 
 struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev,
 					       unsigned int level);
@@ -265,12 +263,6 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev,
-					unsigned long u_volt)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
 					unsigned long *freq)
 {
-- 
cgit v1.2.3


From 0139da50dc53f0ce2804e83566d290c7e626fd17 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Mon, 4 Jul 2022 12:45:14 +0300
Subject: serial: Embed rs485_supported to uart_port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Embed rs485_supported to uart_port to allow serial core to tweak it as
needed.

Reviewed-by: Lino Sanfilippo <l.sanfilippo@kunbus.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20220704094515.6831-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_bcm2835aux.c | 2 +-
 drivers/tty/serial/8250/8250_dwlib.c      | 4 ++--
 drivers/tty/serial/8250/8250_exar.c       | 2 +-
 drivers/tty/serial/8250/8250_fintek.c     | 4 ++--
 drivers/tty/serial/8250/8250_lpc18xx.c    | 2 +-
 drivers/tty/serial/8250/8250_of.c         | 2 +-
 drivers/tty/serial/8250/8250_pci.c        | 2 +-
 drivers/tty/serial/amba-pl011.c           | 2 +-
 drivers/tty/serial/ar933x_uart.c          | 4 ++--
 drivers/tty/serial/atmel_serial.c         | 2 +-
 drivers/tty/serial/fsl_lpuart.c           | 2 +-
 drivers/tty/serial/imx.c                  | 4 ++--
 drivers/tty/serial/max310x.c              | 2 +-
 drivers/tty/serial/mcf.c                  | 4 ++--
 drivers/tty/serial/omap-serial.c          | 2 +-
 drivers/tty/serial/sc16is7xx.c            | 2 +-
 drivers/tty/serial/serial_core.c          | 8 ++++----
 drivers/tty/serial/stm32-usart.c          | 2 +-
 include/linux/serial_core.h               | 2 +-
 19 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_bcm2835aux.c b/drivers/tty/serial/8250/8250_bcm2835aux.c
index d9f1e618cfbd..047e14ccb165 100644
--- a/drivers/tty/serial/8250/8250_bcm2835aux.c
+++ b/drivers/tty/serial/8250/8250_bcm2835aux.c
@@ -108,7 +108,7 @@ static int bcm2835aux_serial_probe(struct platform_device *pdev)
 	up.port.flags = UPF_SHARE_IRQ | UPF_FIXED_PORT | UPF_FIXED_TYPE |
 			UPF_SKIP_TEST | UPF_IOREMAP;
 	up.port.rs485_config = serial8250_em485_config;
-	up.port.rs485_supported = &serial8250_em485_supported;
+	up.port.rs485_supported = serial8250_em485_supported;
 	up.rs485_start_tx = bcm2835aux_rs485_start_tx;
 	up.rs485_stop_tx = bcm2835aux_rs485_stop_tx;
 
diff --git a/drivers/tty/serial/8250/8250_dwlib.c b/drivers/tty/serial/8250/8250_dwlib.c
index a8bbed74ea70..2c3b1468bd88 100644
--- a/drivers/tty/serial/8250/8250_dwlib.c
+++ b/drivers/tty/serial/8250/8250_dwlib.c
@@ -255,10 +255,10 @@ void dw8250_setup_port(struct uart_port *p)
 	if (pd->hw_rs485_support) {
 		p->rs485_config = dw8250_rs485_config;
 		up->lsr_save_mask = LSR_SAVE_FLAGS | DW_UART_LSR_ADDR_RCVD;
-		p->rs485_supported = &dw8250_rs485_supported;
+		p->rs485_supported = dw8250_rs485_supported;
 	} else {
 		p->rs485_config = serial8250_em485_config;
-		p->rs485_supported = &serial8250_em485_supported;
+		p->rs485_supported = serial8250_em485_supported;
 		up->rs485_start_tx = serial8250_em485_start_tx;
 		up->rs485_stop_tx = serial8250_em485_stop_tx;
 	}
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index f5344cfe303c..314a05e009df 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -550,7 +550,7 @@ pci_xr17v35x_setup(struct exar8250 *priv, struct pci_dev *pcidev,
 
 	port->port.uartclk = baud * 16;
 	port->port.rs485_config = platform->rs485_config;
-	port->port.rs485_supported = platform->rs485_supported;
+	port->port.rs485_supported = *(platform->rs485_supported);
 
 	/*
 	 * Setup the UART clock for the devices on expansion slot to
diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c
index eea693f5b577..65b6b3cbaff6 100644
--- a/drivers/tty/serial/8250/8250_fintek.c
+++ b/drivers/tty/serial/8250/8250_fintek.c
@@ -433,9 +433,9 @@ static void fintek_8250_set_rs485_handler(struct uart_8250_port *uart)
 	case CHIP_ID_F81865:
 		uart->port.rs485_config = fintek_8250_rs485_config;
 		if (!pdata->index)
-			uart->port.rs485_supported = &fintek_8250_rs485_supported_port0;
+			uart->port.rs485_supported = fintek_8250_rs485_supported_port0;
 		else
-			uart->port.rs485_supported = &fintek_8250_rs485_supported;
+			uart->port.rs485_supported = fintek_8250_rs485_supported;
 		break;
 
 	default: /* No RS485 Auto direction functional */
diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c
index d7cb3bb52069..d6ca0d47e9d5 100644
--- a/drivers/tty/serial/8250/8250_lpc18xx.c
+++ b/drivers/tty/serial/8250/8250_lpc18xx.c
@@ -161,7 +161,7 @@ static int lpc18xx_serial_probe(struct platform_device *pdev)
 	uart.port.uartclk = clk_get_rate(data->clk_uart);
 	uart.port.private_data = data;
 	uart.port.rs485_config = lpc18xx_rs485_config;
-	uart.port.rs485_supported = &lpc18xx_rs485_supported;
+	uart.port.rs485_supported = lpc18xx_rs485_supported;
 	uart.port.serial_out = lpc18xx_uart_serial_out;
 
 	uart.dma = &data->dma;
diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c
index 65cccd559db2..1b461fba15a3 100644
--- a/drivers/tty/serial/8250/8250_of.c
+++ b/drivers/tty/serial/8250/8250_of.c
@@ -165,7 +165,7 @@ static int of_platform_serial_setup(struct platform_device *ofdev,
 
 	port->dev = &ofdev->dev;
 	port->rs485_config = serial8250_em485_config;
-	port->rs485_supported = &serial8250_em485_supported;
+	port->rs485_supported = serial8250_em485_supported;
 	up->rs485_start_tx = serial8250_em485_start_tx;
 	up->rs485_stop_tx = serial8250_em485_stop_tx;
 
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index d31d2350a9db..8a39ae072c65 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1607,7 +1607,7 @@ static int pci_fintek_setup(struct serial_private *priv,
 	port->port.iotype = UPIO_PORT;
 	port->port.iobase = iobase;
 	port->port.rs485_config = pci_fintek_rs485_config;
-	port->port.rs485_supported = &pci_fintek_rs485_supported;
+	port->port.rs485_supported = pci_fintek_rs485_supported;
 
 	data = devm_kzalloc(&pdev->dev, sizeof(u8), GFP_KERNEL);
 	if (!data)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index c8f52945a4aa..abeceeefdece 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2779,7 +2779,7 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 	uap->port.irq = dev->irq[0];
 	uap->port.ops = &amba_pl011_pops;
 	uap->port.rs485_config = pl011_rs485_config;
-	uap->port.rs485_supported = &pl011_rs485_supported;
+	uap->port.rs485_supported = pl011_rs485_supported;
 	snprintf(uap->type, sizeof(uap->type), "PL011 rev%u", amba_rev(dev));
 
 	ret = pl011_setup_port(&dev->dev, uap, &dev->res, portnr);
diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c
index b73ce13683db..f931ecbc0bc0 100644
--- a/drivers/tty/serial/ar933x_uart.c
+++ b/drivers/tty/serial/ar933x_uart.c
@@ -778,7 +778,7 @@ static int ar933x_uart_probe(struct platform_device *pdev)
 	port->fifosize = AR933X_UART_FIFO_SIZE;
 	port->ops = &ar933x_uart_ops;
 	port->rs485_config = ar933x_config_rs485;
-	port->rs485_supported = &ar933x_rs485_supported;
+	port->rs485_supported = ar933x_rs485_supported;
 
 	baud = ar933x_uart_get_baud(port->uartclk, AR933X_UART_MAX_SCALE, 1);
 	up->min_baud = max_t(unsigned int, baud, AR933X_UART_MIN_BAUD);
@@ -802,7 +802,7 @@ static int ar933x_uart_probe(struct platform_device *pdev)
 	    !up->rts_gpiod) {
 		dev_err(&pdev->dev, "lacking rts-gpio, disabling RS485\n");
 		port->rs485.flags &= ~SER_RS485_ENABLED;
-		port->rs485_supported = &ar933x_no_rs485;
+		port->rs485_supported = ar933x_no_rs485;
 	}
 
 #ifdef CONFIG_SERIAL_AR933X_CONSOLE
diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index bc6004679585..30ba9eef7b39 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -2498,7 +2498,7 @@ static int atmel_init_port(struct atmel_uart_port *atmel_port,
 	port->mapbase		= mpdev->resource[0].start;
 	port->irq		= platform_get_irq(mpdev, 0);
 	port->rs485_config	= atmel_config_rs485;
-	port->rs485_supported	= &atmel_rs485_supported;
+	port->rs485_supported	= atmel_rs485_supported;
 	port->iso7816_config	= atmel_config_iso7816;
 	port->membase		= NULL;
 
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index 8fe0494d4057..fc7d235a1e27 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2655,7 +2655,7 @@ static int lpuart_probe(struct platform_device *pdev)
 		sport->port.rs485_config = lpuart32_config_rs485;
 	else
 		sport->port.rs485_config = lpuart_config_rs485;
-	sport->port.rs485_supported = &lpuart_rs485_supported;
+	sport->port.rs485_supported = lpuart_rs485_supported;
 
 	sport->ipg_clk = devm_clk_get(&pdev->dev, "ipg");
 	if (IS_ERR(sport->ipg_clk)) {
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 3457006cea3f..522445a8f666 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -2285,9 +2285,9 @@ static int imx_uart_probe(struct platform_device *pdev)
 	sport->port.rs485_config = imx_uart_rs485_config;
 	/* RTS is required to control the RS485 transmitter */
 	if (sport->have_rtscts || sport->have_rtsgpio)
-		sport->port.rs485_supported = &imx_rs485_supported;
+		sport->port.rs485_supported = imx_rs485_supported;
 	else
-		sport->port.rs485_supported = &imx_no_rs485;
+		sport->port.rs485_supported = imx_no_rs485;
 	sport->port.flags = UPF_BOOT_AUTOCONF;
 	timer_setup(&sport->timer, imx_uart_timeout, 0);
 
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index e162bfb44080..ab10ca4a45b5 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -1370,7 +1370,7 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
 		s->p[i].port.membase	= (void __iomem *)~0;
 		s->p[i].port.uartclk	= uartclk;
 		s->p[i].port.rs485_config = max310x_rs485_config;
-		s->p[i].port.rs485_supported = &max310x_rs485_supported;
+		s->p[i].port.rs485_supported = max310x_rs485_supported;
 		s->p[i].port.ops	= &max310x_ops;
 		s->p[i].regmap		= regmaps[i];
 
diff --git a/drivers/tty/serial/mcf.c b/drivers/tty/serial/mcf.c
index 73c5287b8e5e..f4aaaadd0742 100644
--- a/drivers/tty/serial/mcf.c
+++ b/drivers/tty/serial/mcf.c
@@ -506,7 +506,7 @@ int __init early_mcf_setup(struct mcf_platform_uart *platp)
 		port->uartclk = MCF_BUSCLK;
 		port->flags = UPF_BOOT_AUTOCONF;
 		port->rs485_config = mcf_config_rs485;
-		port->rs485_supported = &mcf_rs485_supported;
+		port->rs485_supported = mcf_rs485_supported;
 		port->ops = &mcf_uart_ops;
 	}
 
@@ -634,7 +634,7 @@ static int mcf_probe(struct platform_device *pdev)
 		port->ops = &mcf_uart_ops;
 		port->flags = UPF_BOOT_AUTOCONF;
 		port->rs485_config = mcf_config_rs485;
-		port->rs485_supported = &mcf_rs485_supported;
+		port->rs485_supported = mcf_rs485_supported;
 		port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_MCF_CONSOLE);
 
 		uart_add_one_port(&mcf_driver, port);
diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
index 196bae704f85..0aa666e247d5 100644
--- a/drivers/tty/serial/omap-serial.c
+++ b/drivers/tty/serial/omap-serial.c
@@ -1643,7 +1643,7 @@ static int serial_omap_probe(struct platform_device *pdev)
 	up->port.flags = omap_up_info->flags;
 	up->port.uartclk = omap_up_info->uartclk;
 	up->port.rs485_config = serial_omap_config_rs485;
-	up->port.rs485_supported = &serial_omap_rs485_supported;
+	up->port.rs485_supported = serial_omap_rs485_supported;
 	if (!up->port.uartclk) {
 		up->port.uartclk = DEFAULT_CLK_SPEED;
 		dev_warn(&pdev->dev,
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 8cb92a3b3fb8..259e08cc347c 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1461,7 +1461,7 @@ static int sc16is7xx_probe(struct device *dev,
 		s->p[i].port.iotype	= UPIO_PORT;
 		s->p[i].port.uartclk	= freq;
 		s->p[i].port.rs485_config = sc16is7xx_config_rs485;
-		s->p[i].port.rs485_supported = &sc16is7xx_rs485_supported;
+		s->p[i].port.rs485_supported = sc16is7xx_rs485_supported;
 		s->p[i].port.ops	= &sc16is7xx_ops;
 		s->p[i].old_mctrl	= 0;
 		s->p[i].port.line	= sc16is7xx_alloc_line();
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 85ef7ef00b82..a9cf1044a9fa 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1285,7 +1285,7 @@ static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *r
 	 * For any bit outside of the legacy ones that is not supported by
 	 * the driver, return -EINVAL.
 	 */
-	if (flags & ~port->rs485_supported->flags)
+	if (flags & ~port->rs485_supported.flags)
 		return -EINVAL;
 
 	/* Asking for address w/o addressing mode? */
@@ -1304,7 +1304,7 @@ static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *r
 
 static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs485 *rs485)
 {
-	u32 supported_flags = port->rs485_supported->flags;
+	u32 supported_flags = port->rs485_supported.flags;
 
 	if (!(rs485->flags & SER_RS485_ENABLED)) {
 		memset(rs485, 0, sizeof(*rs485));
@@ -1323,7 +1323,7 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4
 		supported_flags |= SER_RS485_RTS_ON_SEND|SER_RS485_RTS_AFTER_SEND;
 	}
 
-	if (!port->rs485_supported->delay_rts_before_send) {
+	if (!port->rs485_supported.delay_rts_before_send) {
 		if (rs485->delay_rts_before_send) {
 			dev_warn_ratelimited(port->dev,
 				"%s (%d): RTS delay before sending not supported\n",
@@ -1337,7 +1337,7 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4
 			port->name, port->line, rs485->delay_rts_before_send);
 	}
 
-	if (!port->rs485_supported->delay_rts_after_send) {
+	if (!port->rs485_supported.delay_rts_after_send) {
 		if (rs485->delay_rts_after_send) {
 			dev_warn_ratelimited(port->dev,
 				"%s (%d): RTS delay after sending not supported\n",
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index 13992e64a7df..ff5c7e0ebc4c 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -1401,7 +1401,7 @@ static int stm32_usart_init_port(struct stm32_port *stm32port,
 	port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_STM32_CONSOLE);
 	port->irq = irq;
 	port->rs485_config = stm32_usart_config_rs485;
-	port->rs485_supported = &stm32_rs485_supported;
+	port->rs485_supported = stm32_rs485_supported;
 
 	ret = stm32_usart_init_rs485(port, pdev);
 	if (ret)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index b7b86ee3cb12..a6fa7c40c330 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -255,7 +255,7 @@ struct uart_port {
 	struct attribute_group	*attr_group;		/* port specific attributes */
 	const struct attribute_group **tty_groups;	/* all attributes (serial core use only) */
 	struct serial_rs485     rs485;
-	const struct serial_rs485	*rs485_supported;	/* Supported mask for serial_rs485 */
+	struct serial_rs485	rs485_supported;	/* Supported mask for serial_rs485 */
 	struct gpio_desc	*rs485_term_gpio;	/* enable RS485 bus termination */
 	struct serial_iso7816   iso7816;
 	void			*private_data;		/* generic platform data pointer */
-- 
cgit v1.2.3


From 91f059c95c6a5dbc0907a5f871e7915a5e93c1f9 Mon Sep 17 00:00:00 2001
From: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Date: Fri, 27 May 2022 23:23:52 +0530
Subject: mmc: core: Capture eMMC and SD card errors

Add changes to capture eMMC and SD card errors.
This is useful for debug and testing.

Signed-off-by: Liangliang Lu <quic_luliang@quicinc.com>
Signed-off-by: Sayali Lokhande <quic_sayalil@quicinc.com>
Signed-off-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Signed-off-by: Ram Prakash Gupta <quic_rampraka@quicinc.com>
Signed-off-by: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Link: https://lore.kernel.org/r/1653674036-21829-2-git-send-email-quic_c_sbhanu@quicinc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/core.c  | 10 +++++++++-
 include/linux/mmc/host.h | 26 ++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 4b70cbfc6d5d..ef53a2578824 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -943,9 +943,11 @@ int mmc_execute_tuning(struct mmc_card *card)
 	}
 
 	/* Only print error when we don't check for card removal */
-	if (!host->detect_change)
+	if (!host->detect_change) {
 		pr_err("%s: tuning execution failed: %d\n",
 			mmc_hostname(host), err);
+		mmc_debugfs_err_stats_inc(host, MMC_ERR_TUNING);
+	}
 
 	return err;
 }
@@ -2244,6 +2246,12 @@ void mmc_rescan(struct work_struct *work)
 		if (freqs[i] <= host->f_min)
 			break;
 	}
+
+	/*
+	 * Ignore the command timeout errors observed during
+	 * the card init as those are excepted.
+	 */
+	host->err_stats[MMC_ERR_CMD_TIMEOUT] = 0;
 	mmc_release_host(host);
 
  out:
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index c193c50ccd78..eb8bc5b9b0b7 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -93,6 +93,25 @@ struct mmc_clk_phase_map {
 
 struct mmc_host;
 
+enum mmc_err_stat {
+	MMC_ERR_CMD_TIMEOUT,
+	MMC_ERR_CMD_CRC,
+	MMC_ERR_DAT_TIMEOUT,
+	MMC_ERR_DAT_CRC,
+	MMC_ERR_AUTO_CMD,
+	MMC_ERR_ADMA,
+	MMC_ERR_TUNING,
+	MMC_ERR_CMDQ_RED,
+	MMC_ERR_CMDQ_GCE,
+	MMC_ERR_CMDQ_ICCE,
+	MMC_ERR_REQ_TIMEOUT,
+	MMC_ERR_CMDQ_REQ_TIMEOUT,
+	MMC_ERR_ICE_CFG,
+	MMC_ERR_CTRL_TIMEOUT,
+	MMC_ERR_UNEXPECTED_IRQ,
+	MMC_ERR_MAX,
+};
+
 struct mmc_host_ops {
 	/*
 	 * It is optional for the host to implement pre_req and post_req in
@@ -501,6 +520,7 @@ struct mmc_host {
 	/* Host Software Queue support */
 	bool			hsq_enabled;
 
+	u32			err_stats[MMC_ERR_MAX];
 	unsigned long		private[] ____cacheline_aligned;
 };
 
@@ -635,6 +655,12 @@ static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data)
 	return data->flags & MMC_DATA_WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 }
 
+static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host,
+					     enum mmc_err_stat stat)
+{
+	host->err_stats[stat] += 1;
+}
+
 int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error);
 int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode);
 int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd);
-- 
cgit v1.2.3


From efe8f5c9b5e118070f424205078ababc46fd130a Mon Sep 17 00:00:00 2001
From: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Date: Fri, 27 May 2022 23:23:53 +0530
Subject: mmc: sdhci: Capture eMMC and SD card errors

Add changes to capture eMMC and SD card errors.
This is useful for debug and testing.

Signed-off-by: Liangliang Lu <quic_luliang@quicinc.com>
Signed-off-by: Sayali Lokhande <quic_sayalil@quicinc.com>
Signed-off-by: Bao D. Nguyen <quic_nguyenb@quicinc.com>
Signed-off-by: Shaik Sajida Bhanu <quic_c_sbhanu@quicinc.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Link: https://lore.kernel.org/r/1653674036-21829-3-git-send-email-quic_c_sbhanu@quicinc.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci.c | 59 ++++++++++++++++++++++++++++++++++++------------
 drivers/mmc/host/sdhci.h |  3 +++
 include/linux/mmc/mmc.h  |  6 +++++
 3 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 22152029e14c..7689ffec5ad1 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -224,6 +224,7 @@ void sdhci_reset(struct sdhci_host *host, u8 mask)
 		if (timedout) {
 			pr_err("%s: Reset 0x%x never completed.\n",
 				mmc_hostname(host->mmc), (int)mask);
+			sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 			sdhci_dumpregs(host);
 			return;
 		}
@@ -1716,6 +1717,7 @@ static bool sdhci_send_command_retry(struct sdhci_host *host,
 		if (!timeout--) {
 			pr_err("%s: Controller never released inhibit bit(s).\n",
 			       mmc_hostname(host->mmc));
+			sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 			sdhci_dumpregs(host);
 			cmd->error = -EIO;
 			return false;
@@ -1965,6 +1967,7 @@ void sdhci_enable_clk(struct sdhci_host *host, u16 clk)
 		if (timedout) {
 			pr_err("%s: Internal clock never stabilised.\n",
 			       mmc_hostname(host->mmc));
+			sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 			sdhci_dumpregs(host);
 			return;
 		}
@@ -1987,6 +1990,7 @@ void sdhci_enable_clk(struct sdhci_host *host, u16 clk)
 			if (timedout) {
 				pr_err("%s: PLL clock never stabilised.\n",
 				       mmc_hostname(host->mmc));
+				sdhci_err_stats_inc(host, CTRL_TIMEOUT);
 				sdhci_dumpregs(host);
 				return;
 			}
@@ -3161,6 +3165,7 @@ static void sdhci_timeout_timer(struct timer_list *t)
 	if (host->cmd && !sdhci_data_line_cmd(host->cmd)) {
 		pr_err("%s: Timeout waiting for hardware cmd interrupt.\n",
 		       mmc_hostname(host->mmc));
+		sdhci_err_stats_inc(host, REQ_TIMEOUT);
 		sdhci_dumpregs(host);
 
 		host->cmd->error = -ETIMEDOUT;
@@ -3183,6 +3188,7 @@ static void sdhci_timeout_data_timer(struct timer_list *t)
 	    (host->cmd && sdhci_data_line_cmd(host->cmd))) {
 		pr_err("%s: Timeout waiting for hardware interrupt.\n",
 		       mmc_hostname(host->mmc));
+		sdhci_err_stats_inc(host, REQ_TIMEOUT);
 		sdhci_dumpregs(host);
 
 		if (host->data) {
@@ -3234,17 +3240,21 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p)
 			return;
 		pr_err("%s: Got command interrupt 0x%08x even though no command operation was in progress.\n",
 		       mmc_hostname(host->mmc), (unsigned)intmask);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 		return;
 	}
 
 	if (intmask & (SDHCI_INT_TIMEOUT | SDHCI_INT_CRC |
 		       SDHCI_INT_END_BIT | SDHCI_INT_INDEX)) {
-		if (intmask & SDHCI_INT_TIMEOUT)
+		if (intmask & SDHCI_INT_TIMEOUT) {
 			host->cmd->error = -ETIMEDOUT;
-		else
+			sdhci_err_stats_inc(host, CMD_TIMEOUT);
+		} else {
 			host->cmd->error = -EILSEQ;
-
+			if (!mmc_op_tuning(host->cmd->opcode))
+				sdhci_err_stats_inc(host, CMD_CRC);
+		}
 		/* Treat data command CRC error the same as data CRC error */
 		if (host->cmd->data &&
 		    (intmask & (SDHCI_INT_CRC | SDHCI_INT_TIMEOUT)) ==
@@ -3266,6 +3276,8 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p)
 			  -ETIMEDOUT :
 			  -EILSEQ;
 
+		sdhci_err_stats_inc(host, AUTO_CMD);
+
 		if (sdhci_auto_cmd23(host, mrq)) {
 			mrq->sbc->error = err;
 			__sdhci_finish_mrq(host, mrq);
@@ -3342,6 +3354,7 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask)
 			if (intmask & SDHCI_INT_DATA_TIMEOUT) {
 				host->data_cmd = NULL;
 				data_cmd->error = -ETIMEDOUT;
+				sdhci_err_stats_inc(host, CMD_TIMEOUT);
 				__sdhci_finish_mrq(host, data_cmd->mrq);
 				return;
 			}
@@ -3370,23 +3383,30 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask)
 
 		pr_err("%s: Got data interrupt 0x%08x even though no data operation was in progress.\n",
 		       mmc_hostname(host->mmc), (unsigned)intmask);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 
 		return;
 	}
 
-	if (intmask & SDHCI_INT_DATA_TIMEOUT)
+	if (intmask & SDHCI_INT_DATA_TIMEOUT) {
 		host->data->error = -ETIMEDOUT;
-	else if (intmask & SDHCI_INT_DATA_END_BIT)
+		sdhci_err_stats_inc(host, DAT_TIMEOUT);
+	} else if (intmask & SDHCI_INT_DATA_END_BIT) {
 		host->data->error = -EILSEQ;
-	else if ((intmask & SDHCI_INT_DATA_CRC) &&
+		if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))))
+			sdhci_err_stats_inc(host, DAT_CRC);
+	} else if ((intmask & SDHCI_INT_DATA_CRC) &&
 		SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))
-			!= MMC_BUS_TEST_R)
+			!= MMC_BUS_TEST_R) {
 		host->data->error = -EILSEQ;
-	else if (intmask & SDHCI_INT_ADMA_ERROR) {
+		if (!mmc_op_tuning(SDHCI_GET_CMD(sdhci_readw(host, SDHCI_COMMAND))))
+			sdhci_err_stats_inc(host, DAT_CRC);
+	} else if (intmask & SDHCI_INT_ADMA_ERROR) {
 		pr_err("%s: ADMA error: 0x%08x\n", mmc_hostname(host->mmc),
 		       intmask);
 		sdhci_adma_show_error(host);
+		sdhci_err_stats_inc(host, ADMA);
 		host->data->error = -EIO;
 		if (host->ops->adma_workaround)
 			host->ops->adma_workaround(host, intmask);
@@ -3584,6 +3604,7 @@ out:
 	if (unexpected) {
 		pr_err("%s: Unexpected interrupt 0x%08x.\n",
 			   mmc_hostname(host->mmc), unexpected);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 	}
 
@@ -3905,20 +3926,27 @@ bool sdhci_cqe_irq(struct sdhci_host *host, u32 intmask, int *cmd_error,
 	if (!host->cqe_on)
 		return false;
 
-	if (intmask & (SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC))
+	if (intmask & (SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC)) {
 		*cmd_error = -EILSEQ;
-	else if (intmask & SDHCI_INT_TIMEOUT)
+		if (!mmc_op_tuning(host->cmd->opcode))
+			sdhci_err_stats_inc(host, CMD_CRC);
+	} else if (intmask & SDHCI_INT_TIMEOUT) {
 		*cmd_error = -ETIMEDOUT;
-	else
+		sdhci_err_stats_inc(host, CMD_TIMEOUT);
+	} else
 		*cmd_error = 0;
 
-	if (intmask & (SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC))
+	if (intmask & (SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC)) {
 		*data_error = -EILSEQ;
-	else if (intmask & SDHCI_INT_DATA_TIMEOUT)
+		if (!mmc_op_tuning(host->cmd->opcode))
+			sdhci_err_stats_inc(host, DAT_CRC);
+	} else if (intmask & SDHCI_INT_DATA_TIMEOUT) {
 		*data_error = -ETIMEDOUT;
-	else if (intmask & SDHCI_INT_ADMA_ERROR)
+		sdhci_err_stats_inc(host, DAT_TIMEOUT);
+	} else if (intmask & SDHCI_INT_ADMA_ERROR) {
 		*data_error = -EIO;
-	else
+		sdhci_err_stats_inc(host, ADMA);
+	} else
 		*data_error = 0;
 
 	/* Clear selected interrupts. */
@@ -3934,6 +3962,7 @@ bool sdhci_cqe_irq(struct sdhci_host *host, u32 intmask, int *cmd_error,
 		sdhci_writel(host, intmask, SDHCI_INT_STATUS);
 		pr_err("%s: CQE: Unexpected interrupt 0x%08x.\n",
 		       mmc_hostname(host->mmc), intmask);
+		sdhci_err_stats_inc(host, UNEXPECTED_IRQ);
 		sdhci_dumpregs(host);
 	}
 
diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h
index d7929d725730..95a08f09df30 100644
--- a/drivers/mmc/host/sdhci.h
+++ b/drivers/mmc/host/sdhci.h
@@ -356,6 +356,9 @@ struct sdhci_adma2_64_desc {
  */
 #define MMC_CMD_TRANSFER_TIME	(10 * NSEC_PER_MSEC) /* max 10 ms */
 
+#define sdhci_err_stats_inc(host, err_name) \
+	mmc_debugfs_err_stats_inc((host)->mmc, MMC_ERR_##err_name)
+
 enum sdhci_cookie {
 	COOKIE_UNMAPPED,
 	COOKIE_PRE_MAPPED,	/* mapped by sdhci_pre_req() */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index d9a65c6a8816..9c50bc40f8ff 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -99,6 +99,12 @@ static inline bool mmc_op_multi(u32 opcode)
 	       opcode == MMC_READ_MULTIPLE_BLOCK;
 }
 
+static inline bool mmc_op_tuning(u32 opcode)
+{
+	return opcode == MMC_SEND_TUNING_BLOCK ||
+			opcode == MMC_SEND_TUNING_BLOCK_HS200;
+}
+
 /*
  * MMC_SWITCH argument format:
  *
-- 
cgit v1.2.3


From 2083da24eb56ce622332946800a67a7449d85fe5 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 10 Jun 2022 12:02:04 +0530
Subject: OPP: Allow multiple clocks for a device

This patch adds support to allow multiple clocks for a device.

The design is pretty much similar to how this is done for regulators,
and platforms can supply their own version of the config_clks() callback
if they have multiple clocks for their device. The core manages the
calls via opp_table->config_clks() eventually.

We have kept both "clk" and "clks" fields in the OPP table structure and
the reason is provided as a comment in _opp_set_clknames(). The same
isn't done for "rates" though and we use rates[0] at most of the places
now.

Co-developed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 213 +++++++++++++++++++++++++++++++++++--------------
 drivers/opp/debugfs.c  |  27 ++++++-
 drivers/opp/of.c       |  66 +++++++++++----
 drivers/opp/opp.h      |  16 ++--
 include/linux/pm_opp.h |   7 +-
 5 files changed, 246 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index daabc810a1f9..e3322d54942a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -181,7 +181,7 @@ unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
 		return 0;
 	}
 
-	return opp->rate;
+	return opp->rates[0];
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
 
@@ -430,7 +430,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
 /* Helpers to read keys */
 static unsigned long _read_freq(struct dev_pm_opp *opp, int index)
 {
-	return opp->rate;
+	return opp->rates[0];
 }
 
 static unsigned long _read_level(struct dev_pm_opp *opp, int index)
@@ -784,22 +784,19 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
 	return ret;
 }
 
-static inline int _generic_set_opp_clk_only(struct device *dev,
-		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data)
+static int
+_opp_config_clk_single(struct device *dev, struct opp_table *opp_table,
+		       struct dev_pm_opp *opp, void *data, bool scaling_down)
 {
 	unsigned long *target = data;
 	unsigned long freq;
 	int ret;
 
-	/* We may reach here for devices which don't change frequency */
-	if (IS_ERR(opp_table->clk))
-		return 0;
-
 	/* One of target and opp must be available */
 	if (target) {
 		freq = *target;
 	} else if (opp) {
-		freq = opp->rate;
+		freq = opp->rates[0];
 	} else {
 		WARN_ON(1);
 		return -EINVAL;
@@ -1025,11 +1022,11 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 	}
 
 	dev_dbg(dev, "%s: switching OPP: Freq %lu -> %lu Hz, Level %u -> %u, Bw %u -> %u\n",
-		__func__, old_opp->rate, opp->rate, old_opp->level, opp->level,
-		old_opp->bandwidth ? old_opp->bandwidth[0].peak : 0,
+		__func__, old_opp->rates[0], opp->rates[0], old_opp->level,
+		opp->level, old_opp->bandwidth ? old_opp->bandwidth[0].peak : 0,
 		opp->bandwidth ? opp->bandwidth[0].peak : 0);
 
-	scaling_down = _opp_compare_key(old_opp, opp);
+	scaling_down = _opp_compare_key(opp_table, old_opp, opp);
 	if (scaling_down == -1)
 		scaling_down = 0;
 
@@ -1059,9 +1056,11 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table,
 		}
 	}
 
-	ret = _generic_set_opp_clk_only(dev, opp_table, opp, clk_data);
-	if (ret)
-		return ret;
+	if (opp_table->config_clks) {
+		ret = opp_table->config_clks(dev, opp_table, opp, clk_data, scaling_down);
+		if (ret)
+			return ret;
+	}
 
 	/* Scaling down? Configure required OPPs after frequency */
 	if (scaling_down) {
@@ -1133,8 +1132,8 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		 * equivalent to a clk_set_rate()
 		 */
 		if (!_get_opp_count(opp_table)) {
-			ret = _generic_set_opp_clk_only(dev, opp_table, NULL,
-							&target_freq);
+			ret = opp_table->config_clks(dev, opp_table, NULL,
+						     &target_freq, false);
 			goto put_opp_table;
 		}
 
@@ -1255,6 +1254,8 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index)
 	INIT_LIST_HEAD(&opp_table->dev_list);
 	INIT_LIST_HEAD(&opp_table->lazy);
 
+	opp_table->clk = ERR_PTR(-ENODEV);
+
 	/* Mark regulator count uninitialized */
 	opp_table->regulator_count = -1;
 
@@ -1301,20 +1302,38 @@ static struct opp_table *_update_opp_table_clk(struct device *dev,
 	int ret;
 
 	/*
-	 * Return early if we don't need to get clk or we have already tried it
+	 * Return early if we don't need to get clk or we have already done it
 	 * earlier.
 	 */
-	if (!getclk || IS_ERR(opp_table) || opp_table->clk)
+	if (!getclk || IS_ERR(opp_table) || !IS_ERR(opp_table->clk) ||
+	    opp_table->clks)
 		return opp_table;
 
 	/* Find clk for the device */
 	opp_table->clk = clk_get(dev, NULL);
 
 	ret = PTR_ERR_OR_ZERO(opp_table->clk);
-	if (!ret)
+	if (!ret) {
+		opp_table->config_clks = _opp_config_clk_single;
+		opp_table->clk_count = 1;
 		return opp_table;
+	}
 
 	if (ret == -ENOENT) {
+		/*
+		 * There are few platforms which don't want the OPP core to
+		 * manage device's clock settings. In such cases neither the
+		 * platform provides the clks explicitly to us, nor the DT
+		 * contains a valid clk entry. The OPP nodes in DT may still
+		 * contain "opp-hz" property though, which we need to parse and
+		 * allow the platform to find an OPP based on freq later on.
+		 *
+		 * This is a simple solution to take care of such corner cases,
+		 * i.e. make the clk_count 1, which lets us allocate space for
+		 * frequency in opp->rates and also parse the entries in DT.
+		 */
+		opp_table->clk_count = 1;
+
 		dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret);
 		return opp_table;
 	}
@@ -1417,7 +1436,7 @@ static void _opp_table_kref_release(struct kref *kref)
 
 	_of_clear_opp_table(opp_table);
 
-	/* Release clk */
+	/* Release automatically acquired single clk */
 	if (!IS_ERR(opp_table->clk))
 		clk_put(opp_table->clk);
 
@@ -1505,7 +1524,7 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq)
 	mutex_lock(&opp_table->lock);
 
 	list_for_each_entry(iter, &opp_table->opp_list, node) {
-		if (iter->rate == freq) {
+		if (iter->rates[0] == freq) {
 			opp = iter;
 			break;
 		}
@@ -1612,24 +1631,28 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_remove_all_dynamic);
 struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table)
 {
 	struct dev_pm_opp *opp;
-	int supply_count, supply_size, icc_size;
+	int supply_count, supply_size, icc_size, clk_size;
 
 	/* Allocate space for at least one supply */
 	supply_count = opp_table->regulator_count > 0 ?
 			opp_table->regulator_count : 1;
 	supply_size = sizeof(*opp->supplies) * supply_count;
+	clk_size = sizeof(*opp->rates) * opp_table->clk_count;
 	icc_size = sizeof(*opp->bandwidth) * opp_table->path_count;
 
 	/* allocate new OPP node and supplies structures */
-	opp = kzalloc(sizeof(*opp) + supply_size + icc_size, GFP_KERNEL);
-
+	opp = kzalloc(sizeof(*opp) + supply_size + clk_size + icc_size, GFP_KERNEL);
 	if (!opp)
 		return NULL;
 
-	/* Put the supplies at the end of the OPP structure as an empty array */
+	/* Put the supplies, bw and clock at the end of the OPP structure */
 	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
+
+	opp->rates = (unsigned long *)(opp->supplies + supply_count);
+
 	if (icc_size)
-		opp->bandwidth = (struct dev_pm_opp_icc_bw *)(opp->supplies + supply_count);
+		opp->bandwidth = (struct dev_pm_opp_icc_bw *)(opp->rates + opp_table->clk_count);
+
 	INIT_LIST_HEAD(&opp->node);
 
 	return opp;
@@ -1660,21 +1683,43 @@ static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
 	return true;
 }
 
+static int _opp_compare_rate(struct opp_table *opp_table,
+			     struct dev_pm_opp *opp1, struct dev_pm_opp *opp2)
+{
+	int i;
+
+	for (i = 0; i < opp_table->clk_count; i++) {
+		if (opp1->rates[i] != opp2->rates[i])
+			return opp1->rates[i] < opp2->rates[i] ? -1 : 1;
+	}
+
+	/* Same rates for both OPPs */
+	return 0;
+}
+
 /*
  * Returns
  * 0: opp1 == opp2
  * 1: opp1 > opp2
  * -1: opp1 < opp2
  */
-int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2)
+int _opp_compare_key(struct opp_table *opp_table, struct dev_pm_opp *opp1,
+		     struct dev_pm_opp *opp2)
 {
-	if (opp1->rate != opp2->rate)
-		return opp1->rate < opp2->rate ? -1 : 1;
+	int ret;
+
+	ret = _opp_compare_rate(opp_table, opp1, opp2);
+	if (ret)
+		return ret;
+
 	if (opp1->bandwidth && opp2->bandwidth &&
 	    opp1->bandwidth[0].peak != opp2->bandwidth[0].peak)
 		return opp1->bandwidth[0].peak < opp2->bandwidth[0].peak ? -1 : 1;
+
 	if (opp1->level != opp2->level)
 		return opp1->level < opp2->level ? -1 : 1;
+
+	/* Duplicate OPPs */
 	return 0;
 }
 
@@ -1694,7 +1739,7 @@ static int _opp_is_duplicate(struct device *dev, struct dev_pm_opp *new_opp,
 	 * loop.
 	 */
 	list_for_each_entry(opp, &opp_table->opp_list, node) {
-		opp_cmp = _opp_compare_key(new_opp, opp);
+		opp_cmp = _opp_compare_key(opp_table, new_opp, opp);
 		if (opp_cmp > 0) {
 			*head = &opp->node;
 			continue;
@@ -1705,8 +1750,8 @@ static int _opp_is_duplicate(struct device *dev, struct dev_pm_opp *new_opp,
 
 		/* Duplicate OPPs */
 		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->supplies[0].u_volt,
-			 opp->available, new_opp->rate,
+			 __func__, opp->rates[0], opp->supplies[0].u_volt,
+			 opp->available, new_opp->rates[0],
 			 new_opp->supplies[0].u_volt, new_opp->available);
 
 		/* Should we compare voltages for all regulators here ? */
@@ -1727,7 +1772,7 @@ void _required_opps_available(struct dev_pm_opp *opp, int count)
 
 		opp->available = false;
 		pr_warn("%s: OPP not supported by required OPP %pOF (%lu)\n",
-			 __func__, opp->required_opps[i]->np, opp->rate);
+			 __func__, opp->required_opps[i]->np, opp->rates[0]);
 		return;
 	}
 }
@@ -1768,7 +1813,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
 		new_opp->available = false;
 		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
-			 __func__, new_opp->rate);
+			 __func__, new_opp->rates[0]);
 	}
 
 	/* required-opps not fully initialized yet */
@@ -1814,7 +1859,7 @@ int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
 		return -ENOMEM;
 
 	/* populate the opp table */
-	new_opp->rate = freq;
+	new_opp->rates[0] = freq;
 	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
 	new_opp->supplies[0].u_volt = u_volt;
 	new_opp->supplies[0].u_volt_min = u_volt - tol;
@@ -2017,6 +2062,17 @@ static void _opp_put_regulators(struct opp_table *opp_table)
 	opp_table->regulator_count = -1;
 }
 
+static void _put_clks(struct opp_table *opp_table, int count)
+{
+	int i;
+
+	for (i = count - 1; i >= 0; i--)
+		clk_put(opp_table->clks[i]);
+
+	kfree(opp_table->clks);
+	opp_table->clks = NULL;
+}
+
 /**
  * _opp_set_clknames() - Set clk names for the device
  * @dev: Device for which clk names is being set.
@@ -2031,10 +2087,12 @@ static void _opp_put_regulators(struct opp_table *opp_table)
  * This must be called before any OPPs are initialized for the device.
  */
 static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
-			     const char * const names[])
+			     const char * const names[],
+			     config_clks_t config_clks)
 {
 	const char * const *temp = names;
-	int count = 0;
+	int count = 0, ret, i;
+	struct clk *clk;
 
 	/* Count number of clks */
 	while (*temp++)
@@ -2047,28 +2105,60 @@ static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
 	if (!count && !names[1])
 		count = 1;
 
-	/* We support only one clock name for now */
-	if (count != 1)
+	/* Fail early for invalid configurations */
+	if (!count || (config_clks && count == 1) || (!config_clks && count > 1))
 		return -EINVAL;
 
 	/* Another CPU that shares the OPP table has set the clkname ? */
-	if (opp_table->clk_configured)
+	if (opp_table->clks)
 		return 0;
 
-	/* clk shouldn't be initialized at this point */
-	if (WARN_ON(opp_table->clk))
-		return -EBUSY;
+	opp_table->clks = kmalloc_array(count, sizeof(*opp_table->clks),
+					GFP_KERNEL);
+	if (!opp_table->clks)
+		return -ENOMEM;
 
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, names[0]);
-	if (IS_ERR(opp_table->clk)) {
-		return dev_err_probe(dev, PTR_ERR(opp_table->clk),
-				    "%s: Couldn't find clock\n", __func__);
+	/* Find clks for the device */
+	for (i = 0; i < count; i++) {
+		clk = clk_get(dev, names[i]);
+		if (IS_ERR(clk)) {
+			ret = dev_err_probe(dev, PTR_ERR(clk),
+					    "%s: Couldn't find clock with name: %s\n",
+					    __func__, names[i]);
+			goto free_clks;
+		}
+
+		opp_table->clks[i] = clk;
 	}
 
-	opp_table->clk_configured = true;
+	opp_table->clk_count = count;
+
+	/* Set generic single clk set here */
+	if (count == 1) {
+		opp_table->config_clks = _opp_config_clk_single;
+
+		/*
+		 * We could have just dropped the "clk" field and used "clks"
+		 * everywhere. Instead we kept the "clk" field around for
+		 * following reasons:
+		 *
+		 * - avoiding clks[0] everywhere else.
+		 * - not running single clk helpers for multiple clk usecase by
+		 *   mistake.
+		 *
+		 * Since this is single-clk case, just update the clk pointer
+		 * too.
+		 */
+		opp_table->clk = opp_table->clks[0];
+	} else {
+		opp_table->config_clks = config_clks;
+	}
 
 	return 0;
+
+free_clks:
+	_put_clks(opp_table, i);
+	return ret;
 }
 
 /**
@@ -2077,11 +2167,13 @@ static int _opp_set_clknames(struct opp_table *opp_table, struct device *dev,
  */
 static void _opp_put_clknames(struct opp_table *opp_table)
 {
-	if (opp_table->clk_configured) {
-		clk_put(opp_table->clk);
-		opp_table->clk = ERR_PTR(-EINVAL);
-		opp_table->clk_configured = false;
-	}
+	if (!opp_table->clks)
+		return;
+
+	opp_table->config_clks = NULL;
+	opp_table->clk = ERR_PTR(-ENODEV);
+
+	_put_clks(opp_table, opp_table->clk_count);
 }
 
 /**
@@ -2298,11 +2390,16 @@ int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config)
 
 	/* Configure clocks */
 	if (config->clk_names) {
-		ret = _opp_set_clknames(opp_table, dev, config->clk_names);
+		ret = _opp_set_clknames(opp_table, dev, config->clk_names,
+					config->config_clks);
 		if (ret)
 			goto err;
 
 		data->flags |= OPP_CONFIG_CLK;
+	} else if (config->config_clks) {
+		/* Don't allow config callback without clocks */
+		ret = -EINVAL;
+		goto err;
 	}
 
 	/* Configure property names */
@@ -2614,7 +2711,7 @@ static int _opp_set_availability(struct device *dev, unsigned long freq,
 
 	/* Do we have the frequency? */
 	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
-		if (tmp_opp->rate == freq) {
+		if (tmp_opp->rates[0] == freq) {
 			opp = tmp_opp;
 			break;
 		}
@@ -2685,7 +2782,7 @@ int dev_pm_opp_adjust_voltage(struct device *dev, unsigned long freq,
 
 	/* Do we have the frequency? */
 	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
-		if (tmp_opp->rate == freq) {
+		if (tmp_opp->rates[0] == freq) {
 			opp = tmp_opp;
 			break;
 		}
diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
index 1b6e5c55c3ed..96a30a032c5f 100644
--- a/drivers/opp/debugfs.c
+++ b/drivers/opp/debugfs.c
@@ -74,6 +74,24 @@ static void opp_debug_create_bw(struct dev_pm_opp *opp,
 	}
 }
 
+static void opp_debug_create_clks(struct dev_pm_opp *opp,
+				  struct opp_table *opp_table,
+				  struct dentry *pdentry)
+{
+	char name[12];
+	int i;
+
+	if (opp_table->clk_count == 1) {
+		debugfs_create_ulong("rate_hz", S_IRUGO, pdentry, &opp->rates[0]);
+		return;
+	}
+
+	for (i = 0; i < opp_table->clk_count; i++) {
+		snprintf(name, sizeof(name), "rate_hz_%d", i);
+		debugfs_create_ulong(name, S_IRUGO, pdentry, &opp->rates[i]);
+	}
+}
+
 static void opp_debug_create_supplies(struct dev_pm_opp *opp,
 				      struct opp_table *opp_table,
 				      struct dentry *pdentry)
@@ -117,10 +135,11 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	 * Get directory name for OPP.
 	 *
 	 * - Normally rate is unique to each OPP, use it to get unique opp-name.
-	 * - For some devices rate isn't available, use index instead.
+	 * - For some devices rate isn't available or there are multiple, use
+	 *   index instead for them.
 	 */
-	if (likely(opp->rate))
-		id = opp->rate;
+	if (likely(opp_table->clk_count == 1 && opp->rates[0]))
+		id = opp->rates[0];
 	else
 		id = _get_opp_count(opp_table);
 
@@ -134,7 +153,6 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo);
 	debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend);
 	debugfs_create_u32("performance_state", S_IRUGO, d, &opp->pstate);
-	debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate);
 	debugfs_create_u32("level", S_IRUGO, d, &opp->level);
 	debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
 			     &opp->clock_latency_ns);
@@ -142,6 +160,7 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	opp->of_name = of_node_full_name(opp->np);
 	debugfs_create_str("of_name", S_IRUGO, d, (char **)&opp->of_name);
 
+	opp_debug_create_clks(opp, opp_table, d);
 	opp_debug_create_supplies(opp, opp_table, d);
 	opp_debug_create_bw(opp, opp_table, d);
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index bb49057cb1fc..7607e112df3f 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -767,6 +767,50 @@ void dev_pm_opp_of_remove_table(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
 
+static int _read_rate(struct dev_pm_opp *new_opp, struct opp_table *opp_table,
+		      struct device_node *np)
+{
+	struct property *prop;
+	int i, count, ret;
+	u64 *rates;
+
+	prop = of_find_property(np, "opp-hz", NULL);
+	if (!prop)
+		return -ENODEV;
+
+	count = prop->length / sizeof(u64);
+	if (opp_table->clk_count != count) {
+		pr_err("%s: Count mismatch between opp-hz and clk_count (%d %d)\n",
+		       __func__, count, opp_table->clk_count);
+		return -EINVAL;
+	}
+
+	rates = kmalloc_array(count, sizeof(*rates), GFP_KERNEL);
+	if (!rates)
+		return -ENOMEM;
+
+	ret = of_property_read_u64_array(np, "opp-hz", rates, count);
+	if (ret) {
+		pr_err("%s: Error parsing opp-hz: %d\n", __func__, ret);
+	} else {
+		/*
+		 * Rate is defined as an unsigned long in clk API, and so
+		 * casting explicitly to its type. Must be fixed once rate is 64
+		 * bit guaranteed in clk API.
+		 */
+		for (i = 0; i < count; i++) {
+			new_opp->rates[i] = (unsigned long)rates[i];
+
+			/* This will happen for frequencies > 4.29 GHz */
+			WARN_ON(new_opp->rates[i] != rates[i]);
+		}
+	}
+
+	kfree(rates);
+
+	return ret;
+}
+
 static int _read_bw(struct dev_pm_opp *new_opp, struct opp_table *opp_table,
 		    struct device_node *np, bool peak)
 {
@@ -812,19 +856,13 @@ static int _read_opp_key(struct dev_pm_opp *new_opp,
 			 struct opp_table *opp_table, struct device_node *np)
 {
 	bool found = false;
-	u64 rate;
 	int ret;
 
-	ret = of_property_read_u64(np, "opp-hz", &rate);
-	if (!ret) {
-		/*
-		 * Rate is defined as an unsigned long in clk API, and so
-		 * casting explicitly to its type. Must be fixed once rate is 64
-		 * bit guaranteed in clk API.
-		 */
-		new_opp->rate = (unsigned long)rate;
+	ret = _read_rate(new_opp, opp_table, np);
+	if (!ret)
 		found = true;
-	}
+	else if (ret != -ENODEV)
+		return ret;
 
 	/*
 	 * Bandwidth consists of peak and average (optional) values:
@@ -893,8 +931,8 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 
 	/* Check if the OPP supports hardware's hierarchy of versions or not */
 	if (!_opp_is_supported(dev, opp_table, np)) {
-		dev_dbg(dev, "OPP not supported by hardware: %lu\n",
-			new_opp->rate);
+		dev_dbg(dev, "OPP not supported by hardware: %s\n",
+			of_node_full_name(np));
 		goto free_opp;
 	}
 
@@ -930,7 +968,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 	if (of_property_read_bool(np, "opp-suspend")) {
 		if (opp_table->suspend_opp) {
 			/* Pick the OPP with higher rate/bw/level as suspend OPP */
-			if (_opp_compare_key(new_opp, opp_table->suspend_opp) == 1) {
+			if (_opp_compare_key(opp_table, new_opp, opp_table->suspend_opp) == 1) {
 				opp_table->suspend_opp->suspend = false;
 				new_opp->suspend = true;
 				opp_table->suspend_opp = new_opp;
@@ -945,7 +983,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
 
 	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu level:%u\n",
-		 __func__, new_opp->turbo, new_opp->rate,
+		 __func__, new_opp->turbo, new_opp->rates[0],
 		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
 		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns,
 		 new_opp->level);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index e481bdb59499..816009eaafee 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -79,7 +79,7 @@ struct opp_config_data {
  * @suspend:	true if suspend OPP
  * @removed:	flag indicating that OPP's reference is dropped by OPP core.
  * @pstate: Device's power domain's performance state.
- * @rate:	Frequency in hertz
+ * @rates:	Frequencies in hertz
  * @level:	Performance level
  * @supplies:	Power supplies voltage/current values
  * @bandwidth:	Interconnect bandwidth values
@@ -102,7 +102,7 @@ struct dev_pm_opp {
 	bool suspend;
 	bool removed;
 	unsigned int pstate;
-	unsigned long rate;
+	unsigned long *rates;
 	unsigned int level;
 
 	struct dev_pm_opp_supply *supplies;
@@ -170,8 +170,10 @@ enum opp_table_access {
  * @supported_hw: Array of version number to support.
  * @supported_hw_count: Number of elements in supported_hw array.
  * @prop_name: A name to postfix to many DT properties, while parsing them.
- * @clk_configured: Clock name is configured by the platform.
- * @clk: Device's clock handle
+ * @config_clks: Platform specific config_clks() callback.
+ * @clks: Device's clock handles, for multiple clocks.
+ * @clk: Device's clock handle, for single clock.
+ * @clk_count: Number of clocks.
  * @config_regulators: Platform specific config_regulators() callback.
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators. Its value can be -1
@@ -220,8 +222,10 @@ struct opp_table {
 	unsigned int *supported_hw;
 	unsigned int supported_hw_count;
 	const char *prop_name;
-	bool clk_configured;
+	config_clks_t config_clks;
+	struct clk **clks;
 	struct clk *clk;
+	int clk_count;
 	config_regulators_t config_regulators;
 	struct regulator **regulators;
 	int regulator_count;
@@ -246,7 +250,7 @@ struct opp_table *_find_opp_table(struct device *dev);
 struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
 struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
 void _opp_free(struct dev_pm_opp *opp);
-int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2);
+int _opp_compare_key(struct opp_table *opp_table, struct dev_pm_opp *opp1, struct dev_pm_opp *opp2);
 int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
 int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
 void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 50cbc75bef71..104151dfe46c 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -61,9 +61,13 @@ typedef int (*config_regulators_t)(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count);
 
+typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table,
+			struct dev_pm_opp *opp, void *data, bool scaling_down);
+
 /**
  * struct dev_pm_opp_config - Device OPP configuration values
- * @clk_names: Clk names, NULL terminated array, max 1 clock for now.
+ * @clk_names: Clk names, NULL terminated array.
+ * @config_clks: Custom set clk helper.
  * @prop_name: Name to postfix to properties.
  * @config_regulators: Custom set regulator helper.
  * @supported_hw: Array of hierarchy of versions to match.
@@ -78,6 +82,7 @@ typedef int (*config_regulators_t)(struct device *dev,
 struct dev_pm_opp_config {
 	/* NULL terminated */
 	const char * const *clk_names;
+	config_clks_t config_clks;
 	const char *prop_name;
 	config_regulators_t config_regulators;
 	const unsigned int *supported_hw;
-- 
cgit v1.2.3


From 8174a3a613af1a911ab19da812824f7180b261f9 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 10 Jun 2022 12:37:25 +0530
Subject: OPP: Provide a simple implementation to configure multiple clocks

This provides a simple implementation to configure multiple clocks for a
device.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 34 ++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h | 10 ++++++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 4675a00b9816..0c5b12e99d39 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -843,6 +843,40 @@ _opp_config_clk_single(struct device *dev, struct opp_table *opp_table,
 	return ret;
 }
 
+/*
+ * Simple implementation for configuring multiple clocks. Configure clocks in
+ * the order in which they are present in the array while scaling up.
+ */
+int dev_pm_opp_config_clks_simple(struct device *dev,
+		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data,
+		bool scaling_down)
+{
+	int ret, i;
+
+	if (scaling_down) {
+		for (i = opp_table->clk_count - 1; i >= 0; i--) {
+			ret = clk_set_rate(opp_table->clks[i], opp->rates[i]);
+			if (ret) {
+				dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+					ret);
+				return ret;
+			}
+		}
+	} else {
+		for (i = 0; i < opp_table->clk_count; i++) {
+			ret = clk_set_rate(opp_table->clks[i], opp->rates[i]);
+			if (ret) {
+				dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+					ret);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_config_clks_simple);
+
 static int _opp_config_regulator_single(struct device *dev,
 			struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp,
 			struct regulator **regulators, unsigned int count)
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 104151dfe46c..683e6baf9618 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -159,6 +159,9 @@ int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb
 int dev_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_config *config);
 void dev_pm_opp_clear_config(int token);
+int dev_pm_opp_config_clks_simple(struct device *dev,
+		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data,
+		bool scaling_down);
 
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
@@ -342,6 +345,13 @@ static inline int devm_pm_opp_set_config(struct device *dev, struct dev_pm_opp_c
 
 static inline void dev_pm_opp_clear_config(int token) {}
 
+static inline int dev_pm_opp_config_clks_simple(struct device *dev,
+		struct opp_table *opp_table, struct dev_pm_opp *opp, void *data,
+		bool scaling_down)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
 				struct opp_table *dst_table, struct dev_pm_opp *src_opp)
 {
-- 
cgit v1.2.3


From 1e5fb38442ebaabb65057c2e58355ee36c2a178f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 5 Jul 2022 09:41:56 +0530
Subject: OPP: Remove dev{m}_pm_opp_of_add_table_noclk()

Remove the now unused variants and the now unnecessary "getclk"
parameter from few routines.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/of.c       | 48 ++++++++----------------------------------------
 include/linux/pm_opp.h | 12 ------------
 2 files changed, 8 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 7607e112df3f..8367823a2001 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1120,7 +1120,7 @@ remove_static_opp:
 	return ret;
 }
 
-static int _of_add_table_indexed(struct device *dev, int index, bool getclk)
+static int _of_add_table_indexed(struct device *dev, int index)
 {
 	struct opp_table *opp_table;
 	int ret, count;
@@ -1136,7 +1136,7 @@ static int _of_add_table_indexed(struct device *dev, int index, bool getclk)
 			index = 0;
 	}
 
-	opp_table = _add_opp_table_indexed(dev, index, getclk);
+	opp_table = _add_opp_table_indexed(dev, index, true);
 	if (IS_ERR(opp_table))
 		return PTR_ERR(opp_table);
 
@@ -1160,11 +1160,11 @@ static void devm_pm_opp_of_table_release(void *data)
 	dev_pm_opp_of_remove_table(data);
 }
 
-static int _devm_of_add_table_indexed(struct device *dev, int index, bool getclk)
+static int _devm_of_add_table_indexed(struct device *dev, int index)
 {
 	int ret;
 
-	ret = _of_add_table_indexed(dev, index, getclk);
+	ret = _of_add_table_indexed(dev, index);
 	if (ret)
 		return ret;
 
@@ -1192,7 +1192,7 @@ static int _devm_of_add_table_indexed(struct device *dev, int index, bool getclk
  */
 int devm_pm_opp_of_add_table(struct device *dev)
 {
-	return _devm_of_add_table_indexed(dev, 0, true);
+	return _devm_of_add_table_indexed(dev, 0);
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table);
 
@@ -1215,7 +1215,7 @@ EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table);
  */
 int dev_pm_opp_of_add_table(struct device *dev)
 {
-	return _of_add_table_indexed(dev, 0, true);
+	return _of_add_table_indexed(dev, 0);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
 
@@ -1231,7 +1231,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
  */
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index)
 {
-	return _of_add_table_indexed(dev, index, true);
+	return _of_add_table_indexed(dev, index);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed);
 
@@ -1244,42 +1244,10 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed);
  */
 int devm_pm_opp_of_add_table_indexed(struct device *dev, int index)
 {
-	return _devm_of_add_table_indexed(dev, index, true);
+	return _devm_of_add_table_indexed(dev, index);
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table_indexed);
 
-/**
- * dev_pm_opp_of_add_table_noclk() - Initialize indexed opp table from device
- *		tree without getting clk for device.
- * @dev:	device pointer used to lookup OPP table.
- * @index:	Index number.
- *
- * Register the initial OPP table with the OPP library for given device only
- * using the "operating-points-v2" property. Do not try to get the clk for the
- * device.
- *
- * Return: Refer to dev_pm_opp_of_add_table() for return values.
- */
-int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return _of_add_table_indexed(dev, index, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_noclk);
-
-/**
- * devm_pm_opp_of_add_table_noclk() - Initialize indexed opp table from device
- *		tree without getting clk for device.
- * @dev:	device pointer used to lookup OPP table.
- * @index:	Index number.
- *
- * This is a resource-managed variant of dev_pm_opp_of_add_table_noclk().
- */
-int devm_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return _devm_of_add_table_indexed(dev, index, false);
-}
-EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table_noclk);
-
 /* CPU device specific helpers */
 
 /**
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 683e6baf9618..dc1fb5890792 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -402,8 +402,6 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev)
 int dev_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index);
 int devm_pm_opp_of_add_table_indexed(struct device *dev, int index);
-int dev_pm_opp_of_add_table_noclk(struct device *dev, int index);
-int devm_pm_opp_of_add_table_noclk(struct device *dev, int index);
 void dev_pm_opp_of_remove_table(struct device *dev);
 int devm_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask);
@@ -434,16 +432,6 @@ static inline int devm_pm_opp_of_add_table_indexed(struct device *dev, int index
 	return -EOPNOTSUPP;
 }
 
-static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int devm_pm_opp_of_add_table_noclk(struct device *dev, int index)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline void dev_pm_opp_of_remove_table(struct device *dev)
 {
 }
-- 
cgit v1.2.3


From 3beb0ab5bffba625007ea5c9e5e0ee5eef05c1ea Mon Sep 17 00:00:00 2001
From: Seunghui Lee <sh043.lee@samsung.com>
Date: Wed, 13 Jul 2022 12:36:34 +0900
Subject: mmc: core: Use mmc_card_* macro and add a new for the sd_combo type

Add mmc_card_sd_combo() macro for sd combo type card and use the mmc_card_*
macro to simplify code instead of comparing card->type.

Signed-off-by: Seunghui Lee <sh043.lee@samsung.com>
Link: https://lore.kernel.org/r/20220713033635.28432-2-sh043.lee@samsung.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c |  4 ++--
 drivers/mmc/core/bus.c   |  4 ++--
 drivers/mmc/core/sd.c    |  2 +-
 drivers/mmc/core/sdio.c  | 16 ++++++++--------
 include/linux/mmc/card.h |  1 +
 5 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index f4a1281658db..9c642b3b7c20 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -2988,7 +2988,7 @@ static int mmc_blk_probe(struct mmc_card *card)
 	 * Don't enable runtime PM for SD-combo cards here. Leave that
 	 * decision to be taken during the SDIO init sequence instead.
 	 */
-	if (card->type != MMC_TYPE_SD_COMBO) {
+	if (!mmc_card_sd_combo(card)) {
 		pm_runtime_set_active(&card->dev);
 		pm_runtime_enable(&card->dev);
 	}
@@ -3015,7 +3015,7 @@ static void mmc_blk_remove(struct mmc_card *card)
 		mmc_blk_part_switch(card, md->part_type);
 		mmc_release_host(card->host);
 	}
-	if (card->type != MMC_TYPE_SD_COMBO)
+	if (!mmc_card_sd_combo(card))
 		pm_runtime_disable(&card->dev);
 	pm_runtime_put_noidle(&card->dev);
 	mmc_blk_remove_req(md);
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index 58a60afa650b..d8762fa3d5cd 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -85,7 +85,7 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 			return retval;
 	}
 
-	if (card->type == MMC_TYPE_SDIO || card->type == MMC_TYPE_SD_COMBO) {
+	if (mmc_card_sdio(card) || mmc_card_sd_combo(card)) {
 		retval = add_uevent_var(env, "SDIO_ID=%04X:%04X",
 					card->cis.vendor, card->cis.device);
 		if (retval)
@@ -107,7 +107,7 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 	 * SDIO (non-combo) cards are not handled by mmc_block driver and do not
 	 * have accessible CID register which used by mmc_card_name() function.
 	 */
-	if (card->type == MMC_TYPE_SDIO)
+	if (mmc_card_sdio(card))
 		return 0;
 
 	retval = add_uevent_var(env, "MMC_NAME=%s", mmc_card_name(card));
diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index d2023837dd72..cee4c0b59f43 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -793,7 +793,7 @@ static umode_t sd_std_is_visible(struct kobject *kobj, struct attribute *attr,
 	     attr == &dev_attr_info2.attr ||
 	     attr == &dev_attr_info3.attr ||
 	     attr == &dev_attr_info4.attr
-	    ) && card->type != MMC_TYPE_SD_COMBO)
+	    ) &&!mmc_card_sd_combo(card))
 		return 0;
 
 	return attr->mode;
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 25799accf8a0..b589df1c35e0 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -335,7 +335,7 @@ static int sdio_disable_4bit_bus(struct mmc_card *card)
 {
 	int err;
 
-	if (card->type == MMC_TYPE_SDIO)
+	if (mmc_card_sdio(card))
 		goto out;
 
 	if (!(card->host->caps & MMC_CAP_4_BIT_DATA))
@@ -360,7 +360,7 @@ static int sdio_enable_4bit_bus(struct mmc_card *card)
 	err = sdio_enable_wide(card);
 	if (err <= 0)
 		return err;
-	if (card->type == MMC_TYPE_SDIO)
+	if (mmc_card_sdio(card))
 		goto out;
 
 	if (card->scr.bus_widths & SD_SCR_BUS_WIDTH_4) {
@@ -415,7 +415,7 @@ static int sdio_enable_hs(struct mmc_card *card)
 	int ret;
 
 	ret = mmc_sdio_switch_hs(card, true);
-	if (ret <= 0 || card->type == MMC_TYPE_SDIO)
+	if (ret <= 0 || mmc_card_sdio(card))
 		return ret;
 
 	ret = mmc_sd_switch_hs(card);
@@ -441,7 +441,7 @@ static unsigned mmc_sdio_get_max_clock(struct mmc_card *card)
 		max_dtr = card->cis.max_dtr;
 	}
 
-	if (card->type == MMC_TYPE_SD_COMBO)
+	if (mmc_card_sd_combo(card))
 		max_dtr = min(max_dtr, mmc_sd_get_max_clock(card));
 
 	return max_dtr;
@@ -689,7 +689,7 @@ try_again:
 	    mmc_sd_get_cid(host, ocr & rocr, card->raw_cid, NULL) == 0) {
 		card->type = MMC_TYPE_SD_COMBO;
 
-		if (oldcard && (oldcard->type != MMC_TYPE_SD_COMBO ||
+		if (oldcard && (!mmc_card_sd_combo(oldcard) ||
 		    memcmp(card->raw_cid, oldcard->raw_cid, sizeof(card->raw_cid)) != 0)) {
 			err = -ENOENT;
 			goto mismatch;
@@ -697,7 +697,7 @@ try_again:
 	} else {
 		card->type = MMC_TYPE_SDIO;
 
-		if (oldcard && oldcard->type != MMC_TYPE_SDIO) {
+		if (oldcard && !mmc_card_sdio(oldcard)) {
 			err = -ENOENT;
 			goto mismatch;
 		}
@@ -754,7 +754,7 @@ try_again:
 	/*
 	 * Read CSD, before selecting the card
 	 */
-	if (!oldcard && card->type == MMC_TYPE_SD_COMBO) {
+	if (!oldcard && mmc_card_sd_combo(card)) {
 		err = mmc_sd_get_csd(card);
 		if (err)
 			goto remove;
@@ -827,7 +827,7 @@ try_again:
 
 	mmc_fixup_device(card, sdio_fixup_methods);
 
-	if (card->type == MMC_TYPE_SD_COMBO) {
+	if (mmc_card_sd_combo(card)) {
 		err = mmc_sd_setup_card(host, card, oldcard != NULL);
 		/* handle as SDIO-only card if memory init failed */
 		if (err) {
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 37f975875102..156a7b673a28 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -348,5 +348,6 @@ bool mmc_card_is_blockaddr(struct mmc_card *card);
 #define mmc_card_mmc(c)		((c)->type == MMC_TYPE_MMC)
 #define mmc_card_sd(c)		((c)->type == MMC_TYPE_SD)
 #define mmc_card_sdio(c)	((c)->type == MMC_TYPE_SDIO)
+#define mmc_card_sd_combo(c)	((c)->type == MMC_TYPE_SD_COMBO)
 
 #endif /* LINUX_MMC_CARD_H */
-- 
cgit v1.2.3


From 20347fca71a387a3751f7bb270062616ddc5317a Mon Sep 17 00:00:00 2001
From: Tianyu Lan <Tianyu.Lan@microsoft.com>
Date: Fri, 8 Jul 2022 12:15:44 -0400
Subject: swiotlb: split up the global swiotlb lock

Traditionally swiotlb was not performance critical because it was only
used for slow devices. But in some setups, like TDX/SEV confidential
guests, all IO has to go through swiotlb. Currently swiotlb only has a
single lock. Under high IO load with multiple CPUs this can lead to
significat lock contention on the swiotlb lock.

This patch splits the swiotlb bounce buffer pool into individual areas
which have their own lock. Each CPU tries to allocate in its own area
first. Only if that fails does it search other areas. On freeing the
allocation is freed into the area where the memory was originally
allocated from.

Area number can be set via swiotlb kernel parameter and is default
to be possible cpu number. If possible cpu number is not power of
2, area number will be round up to the next power of 2.

This idea from Andi Kleen patch(https://github.com/intel/tdx/commit/
4529b5784c141782c72ec9bd9a92df2b68cb7d45).

Based-on-idea-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/admin-guide/kernel-parameters.txt |   4 +-
 include/linux/swiotlb.h                         |   5 +
 kernel/dma/swiotlb.c                            | 229 +++++++++++++++++++-----
 3 files changed, 197 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11e593f..4a6ad177d4b8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5904,8 +5904,10 @@
 			it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
 
 	swiotlb=	[ARM,IA-64,PPC,MIPS,X86]
-			Format: { <int> | force | noforce }
+			Format: { <int> [,<int>] | force | noforce }
 			<int> -- Number of I/O TLB slabs
+			<int> -- Second integer after comma. Number of swiotlb
+				 areas with their own lock. Must be power of 2.
 			force -- force using of bounce buffers even if they
 			         wouldn't be automatically used by the kernel
 			noforce -- Never use bounce buffers (for debugging)
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index bdc58a0e20b1..f65ff1930120 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -88,6 +88,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  * @late_alloc:	%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
  * @for_alloc:  %true if the pool is used for memory allocation
+ * @nareas:  The area number in the pool.
+ * @area_nslabs: The slot number in the area.
  */
 struct io_tlb_mem {
 	phys_addr_t start;
@@ -101,6 +103,9 @@ struct io_tlb_mem {
 	bool late_alloc;
 	bool force_bounce;
 	bool for_alloc;
+	unsigned int nareas;
+	unsigned int area_nslabs;
+	struct io_tlb_area *areas;
 	struct io_tlb_slot {
 		phys_addr_t orig_addr;
 		size_t alloc_size;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 909b43445574..dcf1459ce723 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -70,6 +70,43 @@ struct io_tlb_mem io_tlb_default_mem;
 phys_addr_t swiotlb_unencrypted_base;
 
 static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used:	The number of used IO TLB block.
+ * @index:	The slot index to start searching in this area for next round.
+ * @lock:	The lock to protect the above data structures in the map and
+ *		unmap calls.
+ */
+struct io_tlb_area {
+	unsigned long used;
+	unsigned int index;
+	spinlock_t lock;
+};
+
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+	if (!is_power_of_2(nareas))
+		nareas = roundup_pow_of_two(nareas);
+
+	default_nareas = nareas;
+
+	pr_info("area num %d.\n", nareas);
+	/*
+	 * Round up number of slabs to the next power of 2.
+	 * The last area is going be smaller than the rest if
+	 * default_nslabs is not power of two.
+	 */
+	if (nareas && !is_power_of_2(default_nslabs)) {
+		default_nslabs = roundup_pow_of_two(default_nslabs);
+		pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+			(default_nslabs << IO_TLB_SHIFT) >> 20);
+	}
+}
 
 static int __init
 setup_io_tlb_npages(char *str)
@@ -79,6 +116,10 @@ setup_io_tlb_npages(char *str)
 		default_nslabs =
 			ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
 	}
+	if (*str == ',')
+		++str;
+	if (isdigit(*str))
+		swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
 	if (*str == ',')
 		++str;
 	if (!strcmp(str, "force"))
@@ -112,8 +153,19 @@ void __init swiotlb_adjust_size(unsigned long size)
 	 */
 	if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
 		return;
+
+	/*
+	 * Round up number of slabs to the next power of 2.
+	 * The last area is going be smaller than the rest if
+	 * default_nslabs is not power of two.
+	 */
 	size = ALIGN(size, IO_TLB_SIZE);
 	default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	if (default_nareas) {
+		default_nslabs = roundup_pow_of_two(default_nslabs);
+		size = default_nslabs << IO_TLB_SHIFT;
+	}
+
 	pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
 }
 
@@ -192,7 +244,8 @@ void __init swiotlb_update_mem_attributes(void)
 }
 
 static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
-		unsigned long nslabs, unsigned int flags, bool late_alloc)
+		unsigned long nslabs, unsigned int flags,
+		bool late_alloc, unsigned int nareas)
 {
 	void *vaddr = phys_to_virt(start);
 	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
@@ -202,10 +255,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->end = mem->start + bytes;
 	mem->index = 0;
 	mem->late_alloc = late_alloc;
+	mem->nareas = nareas;
+	mem->area_nslabs = nslabs / mem->nareas;
 
 	mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
 
 	spin_lock_init(&mem->lock);
+	for (i = 0; i < mem->nareas; i++) {
+		spin_lock_init(&mem->areas[i].lock);
+		mem->areas[i].index = 0;
+	}
+
 	for (i = 0; i < mem->nslabs; i++) {
 		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
 		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
@@ -232,7 +292,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 		int (*remap)(void *tlb, unsigned long nslabs))
 {
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long nslabs = default_nslabs;
+	unsigned long nslabs;
 	size_t alloc_size;
 	size_t bytes;
 	void *tlb;
@@ -242,6 +302,14 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 	if (swiotlb_force_disable)
 		return;
 
+	/*
+	 * default_nslabs maybe changed when adjust area number.
+	 * So allocate bounce buffer after adjusting area number.
+	 */
+	if (!default_nareas)
+		swiotlb_adjust_nareas(num_possible_cpus());
+
+	nslabs = default_nslabs;
 	if (nslabs < IO_TLB_MIN_SLABS)
 		panic("%s: nslabs = %lu too small\n", __func__, nslabs);
 
@@ -278,7 +346,13 @@ retry:
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false);
+	mem->areas = memblock_alloc(sizeof(struct io_tlb_area) *
+		default_nareas, SMP_CACHE_BYTES);
+	if (!mem->areas)
+		panic("%s: Failed to allocate mem->areas.\n", __func__);
+
+	swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
+				default_nareas);
 
 	if (flags & SWIOTLB_VERBOSE)
 		swiotlb_print_info();
@@ -300,7 +374,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
 	unsigned char *vstart = NULL;
-	unsigned int order;
+	unsigned int order, area_order;
 	bool retried = false;
 	int rc = 0;
 
@@ -341,19 +415,34 @@ retry:
 			(PAGE_SIZE << order) >> 20);
 	}
 
+	if (!default_nareas)
+		swiotlb_adjust_nareas(num_possible_cpus());
+
+	area_order = get_order(array_size(sizeof(*mem->areas),
+		default_nareas));
+	mem->areas = (struct io_tlb_area *)
+		__get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+	if (!mem->areas)
+		goto error_area;
+
 	mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 		get_order(array_size(sizeof(*mem->slots), nslabs)));
-	if (!mem->slots) {
-		free_pages((unsigned long)vstart, order);
-		return -ENOMEM;
-	}
+	if (!mem->slots)
+		goto error_slots;
 
 	set_memory_decrypted((unsigned long)vstart,
 			     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
-	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true);
+	swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
+				default_nareas);
 
 	swiotlb_print_info();
 	return 0;
+
+error_slots:
+	free_pages((unsigned long)mem->areas, area_order);
+error_area:
+	free_pages((unsigned long)vstart, order);
+	return -ENOMEM;
 }
 
 void __init swiotlb_exit(void)
@@ -361,6 +450,7 @@ void __init swiotlb_exit(void)
 	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long tbl_vaddr;
 	size_t tbl_size, slots_size;
+	unsigned int area_order;
 
 	if (swiotlb_force_bounce)
 		return;
@@ -375,9 +465,14 @@ void __init swiotlb_exit(void)
 
 	set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
 	if (mem->late_alloc) {
+		area_order = get_order(array_size(sizeof(*mem->areas),
+			mem->nareas));
+		free_pages((unsigned long)mem->areas, area_order);
 		free_pages(tbl_vaddr, get_order(tbl_size));
 		free_pages((unsigned long)mem->slots, get_order(slots_size));
 	} else {
+		memblock_free_late(__pa(mem->areas),
+				   mem->nareas * sizeof(struct io_tlb_area));
 		memblock_free_late(mem->start, tbl_size);
 		memblock_free_late(__pa(mem->slots), slots_size);
 	}
@@ -480,9 +575,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
 	return nr_slots(boundary_mask + 1);
 }
 
-static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
 {
-	if (index >= mem->nslabs)
+	if (index >= mem->area_nslabs)
 		return 0;
 	return index;
 }
@@ -491,10 +586,11 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
  * Find a suitable number of IO TLB entries size that will fit this request and
  * allocate a buffer from that IO TLB pool.
  */
-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
-			      size_t alloc_size, unsigned int alloc_align_mask)
+static int swiotlb_do_find_slots(struct io_tlb_mem *mem,
+		struct io_tlb_area *area, int area_index,
+		struct device *dev, phys_addr_t orig_addr,
+		size_t alloc_size, unsigned int alloc_align_mask)
 {
-	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned long boundary_mask = dma_get_seg_boundary(dev);
 	dma_addr_t tbl_dma_addr =
 		phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -505,8 +601,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	unsigned int index, wrap, count = 0, i;
 	unsigned int offset = swiotlb_align_offset(dev, orig_addr);
 	unsigned long flags;
+	unsigned int slot_base;
+	unsigned int slot_index;
 
 	BUG_ON(!nslots);
+	BUG_ON(area_index >= mem->nareas);
 
 	/*
 	 * For mappings with an alignment requirement don't bother looping to
@@ -518,16 +617,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 		stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
 	stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
 
-	spin_lock_irqsave(&mem->lock, flags);
-	if (unlikely(nslots > mem->nslabs - mem->used))
+	spin_lock_irqsave(&area->lock, flags);
+	if (unlikely(nslots > mem->area_nslabs - area->used))
 		goto not_found;
 
-	index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
+	slot_base = area_index * mem->area_nslabs;
+	index = wrap = wrap_area_index(mem, ALIGN(area->index, stride));
+
 	do {
+		slot_index = slot_base + index;
+
 		if (orig_addr &&
-		    (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
-			    (orig_addr & iotlb_align_mask)) {
-			index = wrap_index(mem, index + 1);
+		    (slot_addr(tbl_dma_addr, slot_index) &
+		     iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
+			index = wrap_area_index(mem, index + 1);
 			continue;
 		}
 
@@ -536,26 +639,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 		 * contiguous buffers, we allocate the buffers from that slot
 		 * and mark the entries as '0' indicating unavailable.
 		 */
-		if (!iommu_is_span_boundary(index, nslots,
+		if (!iommu_is_span_boundary(slot_index, nslots,
 					    nr_slots(tbl_dma_addr),
 					    max_slots)) {
-			if (mem->slots[index].list >= nslots)
+			if (mem->slots[slot_index].list >= nslots)
 				goto found;
 		}
-		index = wrap_index(mem, index + stride);
+		index = wrap_area_index(mem, index + stride);
 	} while (index != wrap);
 
 not_found:
-	spin_unlock_irqrestore(&mem->lock, flags);
+	spin_unlock_irqrestore(&area->lock, flags);
 	return -1;
 
 found:
-	for (i = index; i < index + nslots; i++) {
+	for (i = slot_index; i < slot_index + nslots; i++) {
 		mem->slots[i].list = 0;
-		mem->slots[i].alloc_size =
-			alloc_size - (offset + ((i - index) << IO_TLB_SHIFT));
+		mem->slots[i].alloc_size = alloc_size - (offset +
+				((i - slot_index) << IO_TLB_SHIFT));
 	}
-	for (i = index - 1;
+	for (i = slot_index - 1;
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
 	     mem->slots[i].list; i--)
 		mem->slots[i].list = ++count;
@@ -563,14 +666,43 @@ found:
 	/*
 	 * Update the indices to avoid searching in the next round.
 	 */
-	if (index + nslots < mem->nslabs)
-		mem->index = index + nslots;
+	if (index + nslots < mem->area_nslabs)
+		area->index = index + nslots;
 	else
-		mem->index = 0;
-	mem->used += nslots;
+		area->index = 0;
+	area->used += nslots;
+	spin_unlock_irqrestore(&area->lock, flags);
+	return slot_index;
+}
 
-	spin_unlock_irqrestore(&mem->lock, flags);
-	return index;
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+		size_t alloc_size, unsigned int alloc_align_mask)
+{
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	int start = raw_smp_processor_id() & ((1U << __fls(mem->nareas)) - 1);
+	int i = start, index;
+
+	do {
+		index = swiotlb_do_find_slots(mem, mem->areas + i, i,
+					      dev, orig_addr, alloc_size,
+					      alloc_align_mask);
+		if (index >= 0)
+			return index;
+		if (++i >= mem->nareas)
+			i = 0;
+	} while (i != start);
+
+	return -1;
+}
+
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+	int i;
+	unsigned long used = 0;
+
+	for (i = 0; i < mem->nareas; i++)
+		used += mem->areas[i].used;
+	return used;
 }
 
 phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
@@ -602,7 +734,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		if (!(attrs & DMA_ATTR_NO_WARN))
 			dev_warn_ratelimited(dev,
 	"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
-				 alloc_size, mem->nslabs, mem->used);
+				 alloc_size, mem->nslabs, mem_used(mem));
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
@@ -632,6 +764,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 	unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
 	int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
 	int nslots = nr_slots(mem->slots[index].alloc_size + offset);
+	int aindex = index / mem->area_nslabs;
+	struct io_tlb_area *area = &mem->areas[aindex];
 	int count, i;
 
 	/*
@@ -640,7 +774,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 	 * While returning the entries to the free list, we merge the entries
 	 * with slots below and above the pool being returned.
 	 */
-	spin_lock_irqsave(&mem->lock, flags);
+	BUG_ON(aindex >= mem->nareas);
+
+	spin_lock_irqsave(&area->lock, flags);
 	if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
 		count = mem->slots[index + nslots].list;
 	else
@@ -664,8 +800,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
 	     i--)
 		mem->slots[i].list = ++count;
-	mem->used -= nslots;
-	spin_unlock_irqrestore(&mem->lock, flags);
+	area->used -= nslots;
+	spin_unlock_irqrestore(&area->lock, flags);
 }
 
 /*
@@ -763,12 +899,14 @@ EXPORT_SYMBOL_GPL(is_swiotlb_active);
 static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
 					 const char *dirname)
 {
+	unsigned long used = mem_used(mem);
+
 	mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
 	if (!mem->nslabs)
 		return;
 
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
-	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
+	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &used);
 }
 
 static int __init __maybe_unused swiotlb_create_default_debugfs(void)
@@ -819,6 +957,9 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 	struct io_tlb_mem *mem = rmem->priv;
 	unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
 
+	/* Set Per-device io tlb area to one */
+	unsigned int nareas = 1;
+
 	/*
 	 * Since multiple devices can share the same pool, the private data,
 	 * io_tlb_mem struct, will be initialized by the first device attached
@@ -835,10 +976,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 			return -ENOMEM;
 		}
 
+		mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+				GFP_KERNEL);
+		if (!mem->areas) {
+			kfree(mem);
+			kfree(mem->slots);
+			return -ENOMEM;
+		}
+
 		set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
 				     rmem->size >> PAGE_SHIFT);
 		swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
-				false);
+					false, nareas);
 		mem->for_alloc = true;
 
 		rmem->priv = mem;
-- 
cgit v1.2.3


From 6d1c1a73e1126572de0a8b063fe62fe43786ed59 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Fri, 8 Jul 2022 14:13:11 +0800
Subject: soundwire: Intel: add trigger callback

When a pipeline is split into FE and BE parts, the BE pipeline may need to
be triggered separately in the BE trigger op. So add the trigger callback
in the link_res ops that will be invoked during BE DAI trigger.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Acked-by: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20220708061312.25878-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soundwire/intel.c           | 9 +++++++++
 include/linux/soundwire/sdw_intel.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 0268fa527c0c..fed6418d6375 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -1004,9 +1004,18 @@ static int intel_trigger(struct snd_pcm_substream *substream, int cmd, struct sn
 {
 	struct sdw_cdns *cdns = snd_soc_dai_get_drvdata(dai);
 	struct sdw_intel *sdw = cdns_to_intel(cdns);
+	struct sdw_intel_link_res *res = sdw->link_res;
 	struct sdw_cdns_dma_data *dma;
 	int ret = 0;
 
+	/*
+	 * The .trigger callback is used to send required IPC to audio
+	 * firmware. The .free_stream callback will still be called
+	 * by intel_free_stream() in the TRIGGER_SUSPEND case.
+	 */
+	if (res->ops && res->ops->trigger)
+		res->ops->trigger(dai, cmd, substream->stream);
+
 	dma = snd_soc_dai_get_dma_data(dai, substream);
 	if (!dma) {
 		dev_err(dai->dev, "failed to get dma data in %s\n",
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index b5b489ea1aef..ec16ae49e6a4 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -121,6 +121,7 @@ struct sdw_intel_ops {
 			     struct sdw_intel_stream_params_data *params_data);
 	int (*free_stream)(struct device *dev,
 			   struct sdw_intel_stream_free_data *free_data);
+	int (*trigger)(struct snd_soc_dai *dai, int cmd, int stream);
 };
 
 /**
-- 
cgit v1.2.3


From 9745fb07474f4501eff62130a78a42a8b8c18b05 Mon Sep 17 00:00:00 2001
From: Jonathan Yong <jonathan.yong@intel.com>
Date: Mon, 6 Jun 2022 19:41:27 +0300
Subject: platform/x86/intel: Add Primary to Sideband (P2SB) bridge support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SoC features such as GPIO are accessed via a reserved MMIO area,
we don't know its address but can obtain it from the BAR of
the P2SB device, that device is normally hidden so we have to
temporarily unhide it, read address and hide it back.

There are already a few users and at least one more is coming which
require an access to Primary to Sideband (P2SB) bridge in order
to get IO or MMIO BAR hidden by BIOS.

Create a library to access P2SB for x86 devices in a unified way.

Background information
======================
Note, the term "bridge" is used in the documentation and it has nothing
to do with a PCI (host) bridge as per the PCI specifications.

The P2SB is an interesting device by its nature and hardware design.
First of all, it has several devices in the hardware behind it. These
devices may or may not be represented as ACPI devices by a firmware.

It also has a hardwired (to 0s) the least significant bits of the
base address register which is represented by the only 64-bit BAR0.
It means that OS mustn't reallocate the BAR.

On top of that in some cases P2SB is represented by function 0 on PCI
slot (in terms of B:D.F) and according to the PCI specification any
other function can't be seen until function 0 is present and visible.

In the PCI configuration space of P2SB device the full 32-bit register
is allocated for the only purpose of hiding the entire P2SB device. As
per [3]:

  3.1.39 P2SB Control (P2SBC)—Offset E0h

  Hide Device (HIDE): When this bit is set, the P2SB will return 1s on
  any PCI Configuration Read on IOSF-P. All other transactions including
  PCI Configuration Writes on IOSF-P are unaffected by this. This does
  not affect reads performed on the IOSF-SB interface.

This doesn't prevent MMIO accesses, although preventing the OS from
assigning these addresses. The firmware on the affected platforms marks
the region as unusable (by cutting it off from the PCI host bridge
resources) as depicted in the Apollo Lake example below:

  PCI host bridge to bus 0000:00
  pci_bus 0000:00: root bus resource [io  0x0070-0x0077]
  pci_bus 0000:00: root bus resource [io  0x0000-0x006f window]
  pci_bus 0000:00: root bus resource [io  0x0078-0x0cf7 window]
  pci_bus 0000:00: root bus resource [io  0x0d00-0xffff window]
  pci_bus 0000:00: root bus resource [mem 0x7c000001-0x7fffffff window]
  pci_bus 0000:00: root bus resource [mem 0x7b800001-0x7bffffff window]
  pci_bus 0000:00: root bus resource [mem 0x80000000-0xcfffffff window]
  pci_bus 0000:00: root bus resource [mem 0xe0000000-0xefffffff window]
  pci_bus 0000:00: root bus resource [bus 00-ff]

The P2SB 16MB BAR is located at 0xd0000000-0xd0ffffff memory window.

The generic solution
====================
The generic solution for all cases when we need to access to the information
behind P2SB device is a library code where users ask for necessary resources
by demand and hence those users take care of not being run on the systems
where this access is not required.

The library provides the p2sb_bar() API to retrieve the MMIO of the BAR0 of
the device from P2SB device slot.

P2SB unconditional unhiding awareness
=====================================
Technically it's possible to unhide the P2SB device and devices on
the same PCI slot and access them at any time as needed. But there are
several potential issues with that:

 - the systems were never tested against such configuration and hence
   nobody knows what kind of bugs it may bring, especially when we talk
   about SPI NOR case which contains Intel FirmWare Image (IFWI) code
   (including BIOS) and already known to be problematic in the past for
   end users

 - the PCI by its nature is a hotpluggable bus and in case somebody
   attaches a driver to the functions of a P2SB slot device(s) the
   end user experience and system behaviour can be unpredictable

 - the kernel code would need some ugly hacks (or code looking as an
   ugly hack) under arch/x86/pci in order to enable these devices on
   only selected platforms (which may include CPU ID table followed by
   a potentially growing number of DMI strings

The future improvements
=======================
The future improvements with this code may go in order to gain some kind
of cache, if it's possible at all, to prevent unhiding and hiding many
times to take static information that may be saved once per boot.

Links
=====
[1]: https://lab.whitequark.org/notes/2017-11-08/accessing-intel-ich-pch-gpios/
[2]: https://cdrdv2.intel.com/v1/dl/getContent/332690?wapkw=332690
[3]: https://cdrdv2.intel.com/v1/dl/getContent/332691?wapkw=332691
[4]: https://medium.com/@jacksonchen_43335/bios-gpio-p2sb-70e9b829b403

Signed-off-by: Jonathan Yong <jonathan.yong@intel.com>
Co-developed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Henning Schild <henning.schild@siemens.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/platform/x86/intel/Kconfig     |  12 ++++
 drivers/platform/x86/intel/Makefile    |   2 +
 drivers/platform/x86/intel/p2sb.c      | 127 +++++++++++++++++++++++++++++++++
 include/linux/platform_data/x86/p2sb.h |  28 ++++++++
 4 files changed, 169 insertions(+)
 create mode 100644 drivers/platform/x86/intel/p2sb.c
 create mode 100644 include/linux/platform_data/x86/p2sb.h

(limited to 'include/linux')

diff --git a/drivers/platform/x86/intel/Kconfig b/drivers/platform/x86/intel/Kconfig
index 794968bda115..c9cfbaae436b 100644
--- a/drivers/platform/x86/intel/Kconfig
+++ b/drivers/platform/x86/intel/Kconfig
@@ -70,6 +70,18 @@ config INTEL_OAKTRAIL
 	  enable/disable the Camera, WiFi, BT etc. devices. If in doubt, say Y
 	  here; it will only load on supported platforms.
 
+config P2SB
+	bool "Primary to Sideband (P2SB) bridge access support"
+	depends on PCI
+	help
+	  The Primary to Sideband (P2SB) bridge is an interface to some
+	  PCI devices connected through it. In particular, SPI NOR controller
+	  in Intel Apollo Lake SoC is one of such devices.
+
+	  The main purpose of this library is to unhide P2SB device in case
+	  firmware kept it hidden on some platforms in order to access devices
+	  behind it.
+
 config INTEL_BXTWC_PMIC_TMU
 	tristate "Intel Broxton Whiskey Cove TMU Driver"
 	depends on INTEL_SOC_PMIC_BXTWC
diff --git a/drivers/platform/x86/intel/Makefile b/drivers/platform/x86/intel/Makefile
index 717933dd0cfd..741a9404db98 100644
--- a/drivers/platform/x86/intel/Makefile
+++ b/drivers/platform/x86/intel/Makefile
@@ -28,6 +28,8 @@ intel_int0002_vgpio-y			:= int0002_vgpio.o
 obj-$(CONFIG_INTEL_INT0002_VGPIO)	+= intel_int0002_vgpio.o
 intel_oaktrail-y			:= oaktrail.o
 obj-$(CONFIG_INTEL_OAKTRAIL)		+= intel_oaktrail.o
+intel_p2sb-y				:= p2sb.o
+obj-$(CONFIG_P2SB)			+= intel_p2sb.o
 intel_sdsi-y				:= sdsi.o
 obj-$(CONFIG_INTEL_SDSI)		+= intel_sdsi.o
 intel_vsec-y				:= vsec.o
diff --git a/drivers/platform/x86/intel/p2sb.c b/drivers/platform/x86/intel/p2sb.c
new file mode 100644
index 000000000000..b598ef14dbc6
--- /dev/null
+++ b/drivers/platform/x86/intel/p2sb.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Primary to Sideband (P2SB) bridge access support
+ *
+ * Copyright (c) 2017, 2021-2022 Intel Corporation.
+ *
+ * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *	    Jonathan Yong <jonathan.yong@intel.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/platform_data/x86/p2sb.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define P2SBC			0xe0
+#define P2SBC_HIDE		BIT(8)
+
+static const struct x86_cpu_id p2sb_cpu_ids[] = {
+	X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,	PCI_DEVFN(13, 0)),
+	{}
+};
+
+static int p2sb_get_devfn(unsigned int *devfn)
+{
+	const struct x86_cpu_id *id;
+
+	id = x86_match_cpu(p2sb_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	*devfn = (unsigned int)id->driver_data;
+	return 0;
+}
+
+static int p2sb_read_bar0(struct pci_dev *pdev, struct resource *mem)
+{
+	/* Copy resource from the first BAR of the device in question */
+	*mem = pdev->resource[0];
+	return 0;
+}
+
+static int p2sb_scan_and_read(struct pci_bus *bus, unsigned int devfn, struct resource *mem)
+{
+	struct pci_dev *pdev;
+	int ret;
+
+	pdev = pci_scan_single_device(bus, devfn);
+	if (!pdev)
+		return -ENODEV;
+
+	ret = p2sb_read_bar0(pdev, mem);
+
+	pci_stop_and_remove_bus_device(pdev);
+	return ret;
+}
+
+/**
+ * p2sb_bar - Get Primary to Sideband (P2SB) bridge device BAR
+ * @bus: PCI bus to communicate with
+ * @devfn: PCI slot and function to communicate with
+ * @mem: memory resource to be filled in
+ *
+ * The BIOS prevents the P2SB device from being enumerated by the PCI
+ * subsystem, so we need to unhide and hide it back to lookup the BAR.
+ *
+ * if @bus is NULL, the bus 0 in domain 0 will be used.
+ * If @devfn is 0, it will be replaced by devfn of the P2SB device.
+ *
+ * Caller must provide a valid pointer to @mem.
+ *
+ * Locking is handled by pci_rescan_remove_lock mutex.
+ *
+ * Return:
+ * 0 on success or appropriate errno value on error.
+ */
+int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem)
+{
+	struct pci_dev *pdev_p2sb;
+	unsigned int devfn_p2sb;
+	u32 value = P2SBC_HIDE;
+	int ret;
+
+	/* Get devfn for P2SB device itself */
+	ret = p2sb_get_devfn(&devfn_p2sb);
+	if (ret)
+		return ret;
+
+	/* if @bus is NULL, use bus 0 in domain 0 */
+	bus = bus ?: pci_find_bus(0, 0);
+
+	/*
+	 * Prevent concurrent PCI bus scan from seeing the P2SB device and
+	 * removing via sysfs while it is temporarily exposed.
+	 */
+	pci_lock_rescan_remove();
+
+	/* Unhide the P2SB device, if needed */
+	pci_bus_read_config_dword(bus, devfn_p2sb, P2SBC, &value);
+	if (value & P2SBC_HIDE)
+		pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, 0);
+
+	pdev_p2sb = pci_scan_single_device(bus, devfn_p2sb);
+	if (devfn)
+		ret = p2sb_scan_and_read(bus, devfn, mem);
+	else
+		ret = p2sb_read_bar0(pdev_p2sb, mem);
+	pci_stop_and_remove_bus_device(pdev_p2sb);
+
+	/* Hide the P2SB device, if it was hidden */
+	if (value & P2SBC_HIDE)
+		pci_bus_write_config_dword(bus, devfn_p2sb, P2SBC, P2SBC_HIDE);
+
+	pci_unlock_rescan_remove();
+
+	if (ret)
+		return ret;
+
+	if (mem->flags == 0)
+		return -ENODEV;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(p2sb_bar);
diff --git a/include/linux/platform_data/x86/p2sb.h b/include/linux/platform_data/x86/p2sb.h
new file mode 100644
index 000000000000..a1d5fddc8f13
--- /dev/null
+++ b/include/linux/platform_data/x86/p2sb.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Primary to Sideband (P2SB) bridge access support
+ */
+
+#ifndef _PLATFORM_DATA_X86_P2SB_H
+#define _PLATFORM_DATA_X86_P2SB_H
+
+#include <linux/errno.h>
+#include <linux/kconfig.h>
+
+struct pci_bus;
+struct resource;
+
+#if IS_BUILTIN(CONFIG_P2SB)
+
+int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem);
+
+#else /* CONFIG_P2SB */
+
+static inline int p2sb_bar(struct pci_bus *bus, unsigned int devfn, struct resource *mem)
+{
+	return -ENODEV;
+}
+
+#endif /* CONFIG_P2SB is not set */
+
+#endif /* _PLATFORM_DATA_X86_P2SB_H */
-- 
cgit v1.2.3


From 446f0cf9e08b483d7dc6f61eee0ee846b22f6386 Mon Sep 17 00:00:00 2001
From: Henning Schild <henning.schild@siemens.com>
Date: Mon, 6 Jun 2022 19:41:37 +0300
Subject: platform/x86: simatic-ipc: drop custom P2SB bar code

The two drivers that used to use this have been switched over to the
common P2SB accessor, so this code is not needed any longer.

Signed-off-by: Henning Schild <henning.schild@siemens.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/platform/x86/simatic-ipc.c                 | 38 ----------------------
 include/linux/platform_data/x86/simatic-ipc-base.h |  2 --
 2 files changed, 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/simatic-ipc.c b/drivers/platform/x86/simatic-ipc.c
index b599cda5ba3c..26c35e1660cb 100644
--- a/drivers/platform/x86/simatic-ipc.c
+++ b/drivers/platform/x86/simatic-ipc.c
@@ -101,44 +101,6 @@ static int register_platform_devices(u32 station_id)
 	return 0;
 }
 
-/* FIXME: this should eventually be done with generic P2SB discovery code
- * the individual drivers for watchdogs and LEDs access memory that implements
- * GPIO, but pinctrl will not come up because of missing ACPI entries
- *
- * While there is no conflict a cleaner solution would be to somehow bring up
- * pinctrl even with these ACPI entries missing, and base the drivers on pinctrl.
- * After which the following function could be dropped, together with the code
- * poking the memory.
- */
-/*
- * Get membase address from PCI, used in leds and wdt module. Here we read
- * the bar0. The final address calculation is done in the appropriate modules
- */
-u32 simatic_ipc_get_membase0(unsigned int p2sb)
-{
-	struct pci_bus *bus;
-	u32 bar0 = 0;
-	/*
-	 * The GPIO memory is in bar0 of the hidden P2SB device.
-	 * Unhide the device to have a quick look at it, before we hide it
-	 * again.
-	 * Also grab the pci rescan lock so that device does not get discovered
-	 * and remapped while it is visible.
-	 * This code is inspired by drivers/mfd/lpc_ich.c
-	 */
-	bus = pci_find_bus(0, 0);
-	pci_lock_rescan_remove();
-	pci_bus_write_config_byte(bus, p2sb, 0xE1, 0x0);
-	pci_bus_read_config_dword(bus, p2sb, PCI_BASE_ADDRESS_0, &bar0);
-
-	bar0 &= ~0xf;
-	pci_bus_write_config_byte(bus, p2sb, 0xE1, 0x1);
-	pci_unlock_rescan_remove();
-
-	return bar0;
-}
-EXPORT_SYMBOL(simatic_ipc_get_membase0);
-
 static int __init simatic_ipc_init_module(void)
 {
 	const struct dmi_system_id *match;
diff --git a/include/linux/platform_data/x86/simatic-ipc-base.h b/include/linux/platform_data/x86/simatic-ipc-base.h
index 62d2bc774067..39fefd48cf4d 100644
--- a/include/linux/platform_data/x86/simatic-ipc-base.h
+++ b/include/linux/platform_data/x86/simatic-ipc-base.h
@@ -24,6 +24,4 @@ struct simatic_ipc_platform {
 	u8	devmode;
 };
 
-u32 simatic_ipc_get_membase0(unsigned int p2sb);
-
 #endif /* __PLATFORM_DATA_X86_SIMATIC_IPC_BASE_H */
-- 
cgit v1.2.3


From b644c95598adfe2b968e97daee3a890dd2fda84d Mon Sep 17 00:00:00 2001
From: PaddyKP_Yao <PaddyKP_Yao@asus.com>
Date: Mon, 11 Jul 2022 19:51:25 +0800
Subject: platform/x86: asus-wmi: Add mic-mute LED classdev support

In some new ASUS devices, hotkey Fn+F13 is used for mic mute. If mic-mute
LED is present by checking WMI ASUS_WMI_DEVID_MICMUTE_LED, we will add a
mic-mute LED classdev, asus::micmute, in the asus-wmi driver to control
it. The binding of mic-mute LED controls will be swithched with LED
trigger.

Signed-off-by: PaddyKP_Yao <PaddyKP_Yao@asus.com>
Link: https://lore.kernel.org/r/20220711115125.2072508-1-PaddyKP_Yao@asus.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/Kconfig               |  2 ++
 drivers/platform/x86/asus-wmi.c            | 25 +++++++++++++++++++++++++
 include/linux/platform_data/x86/asus-wmi.h |  1 +
 3 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 7fa88efeef4d..6a33c862452b 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -273,6 +273,8 @@ config ASUS_WMI
 	select INPUT_SPARSEKMAP
 	select LEDS_CLASS
 	select NEW_LEDS
+	select LEDS_TRIGGERS
+	select LEDS_TRIGGER_AUDIO
 	select ACPI_PLATFORM_PROFILE
 	help
 	  Say Y here if you have a WMI aware Asus laptop (like Eee PCs or new
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 62ce198a3463..89b604e04d7f 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -208,6 +208,7 @@ struct asus_wmi {
 	int kbd_led_wk;
 	struct led_classdev lightbar_led;
 	int lightbar_led_wk;
+	struct led_classdev micmute_led;
 	struct workqueue_struct *led_workqueue;
 	struct work_struct tpd_led_work;
 	struct work_struct wlan_led_work;
@@ -1028,12 +1029,23 @@ static enum led_brightness lightbar_led_get(struct led_classdev *led_cdev)
 	return result & ASUS_WMI_DSTS_LIGHTBAR_MASK;
 }
 
+static int micmute_led_set(struct led_classdev *led_cdev,
+			   enum led_brightness brightness)
+{
+	int state = brightness != LED_OFF;
+	int err;
+
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_MICMUTE_LED, state, NULL);
+	return err < 0 ? err : 0;
+}
+
 static void asus_wmi_led_exit(struct asus_wmi *asus)
 {
 	led_classdev_unregister(&asus->kbd_led);
 	led_classdev_unregister(&asus->tpd_led);
 	led_classdev_unregister(&asus->wlan_led);
 	led_classdev_unregister(&asus->lightbar_led);
+	led_classdev_unregister(&asus->micmute_led);
 
 	if (asus->led_workqueue)
 		destroy_workqueue(asus->led_workqueue);
@@ -1105,6 +1117,19 @@ static int asus_wmi_led_init(struct asus_wmi *asus)
 					   &asus->lightbar_led);
 	}
 
+	if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MICMUTE_LED)) {
+		asus->micmute_led.name = "asus::micmute";
+		asus->micmute_led.max_brightness = 1;
+		asus->micmute_led.brightness = ledtrig_audio_get(LED_AUDIO_MICMUTE);
+		asus->micmute_led.brightness_set_blocking = micmute_led_set;
+		asus->micmute_led.default_trigger = "audio-micmute";
+
+		rv = led_classdev_register(&asus->platform_device->dev,
+						&asus->micmute_led);
+		if (rv)
+			goto error;
+	}
+
 error:
 	if (rv)
 		asus_wmi_led_exit(asus);
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index a571b47ff362..98f2b2f20f3e 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -49,6 +49,7 @@
 #define ASUS_WMI_DEVID_LED4		0x00020014
 #define ASUS_WMI_DEVID_LED5		0x00020015
 #define ASUS_WMI_DEVID_LED6		0x00020016
+#define ASUS_WMI_DEVID_MICMUTE_LED		0x00040017
 
 /* Backlight and Brightness */
 #define ASUS_WMI_DEVID_ALS_ENABLE	0x00050001 /* Ambient Light Sensor */
-- 
cgit v1.2.3


From cd89edda4002b7fb3c0a6765c3a60a60d5b1dc16 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Thu, 14 Jul 2022 20:42:12 +0800
Subject: PCI: loongson: Add ACPI init support

Loongson PCH (LS7A chipset) will be used by both MIPS-based and LoongArch-
based Loongson processors. MIPS-based Loongson uses FDT, while LoongArch-
based Loongson uses ACPI.

Add ACPI init support for the driver in pci-loongson.c because it is
currently FDT-only.

LoongArch is a new RISC ISA, mainline support will come soon, and
documentations are here (in translation):

  https://github.com/loongson/LoongArch-Documentation

Link: https://lore.kernel.org/r/20220714124216.1489304-4-chenhuacai@loongson.cn
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/acpi/pci_mcfg.c               | 10 ++++
 drivers/pci/controller/Kconfig        |  2 +-
 drivers/pci/controller/pci-loongson.c | 94 ++++++++++++++++++++++++++++-------
 include/linux/pci-ecam.h              |  1 +
 4 files changed, 87 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
index 63b98eae5e75..860014b89b8e 100644
--- a/drivers/acpi/pci_mcfg.c
+++ b/drivers/acpi/pci_mcfg.c
@@ -172,6 +172,16 @@ static struct mcfg_fixup mcfg_quirks[] = {
 	ALTRA_ECAM_QUIRK(1, 14),
 	ALTRA_ECAM_QUIRK(1, 15),
 #endif /* ARM64 */
+
+#ifdef CONFIG_LOONGARCH
+#define LOONGSON_ECAM_MCFG(table_id, seg) \
+	{ "LOONGS", table_id, 1, seg, MCFG_BUS_ANY, &loongson_pci_ecam_ops }
+
+	LOONGSON_ECAM_MCFG("\0", 0),
+	LOONGSON_ECAM_MCFG("LOONGSON", 0),
+	LOONGSON_ECAM_MCFG("\0", 1),
+	LOONGSON_ECAM_MCFG("LOONGSON", 1),
+#endif /* LOONGARCH */
 };
 
 static char mcfg_oem_id[ACPI_OEM_ID_SIZE];
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index b8d96d38064d..9dbd73898b47 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -293,7 +293,7 @@ config PCI_HYPERV_INTERFACE
 config PCI_LOONGSON
 	bool "LOONGSON PCI Controller"
 	depends on MACH_LOONGSON64 || COMPILE_TEST
-	depends on OF
+	depends on OF || ACPI
 	depends on PCI_QUIRKS
 	default MACH_LOONGSON64
 	help
diff --git a/drivers/pci/controller/pci-loongson.c b/drivers/pci/controller/pci-loongson.c
index 565453882ffe..cd29800974e7 100644
--- a/drivers/pci/controller/pci-loongson.c
+++ b/drivers/pci/controller/pci-loongson.c
@@ -9,6 +9,8 @@
 #include <linux/of_pci.h>
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
+#include <linux/pci-acpi.h>
+#include <linux/pci-ecam.h>
 
 #include "../pci.h"
 
@@ -97,39 +99,53 @@ static void loongson_mrrs_quirk(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_ENABLE(PCI_ANY_ID, PCI_ANY_ID, loongson_mrrs_quirk);
 
-static void __iomem *cfg1_map(struct loongson_pci *priv, int bus,
-				unsigned int devfn, int where)
+static struct loongson_pci *pci_bus_to_loongson_pci(struct pci_bus *bus)
 {
-	unsigned long addroff = 0x0;
+	struct pci_config_window *cfg;
 
-	if (bus != 0)
-		addroff |= BIT(28); /* Type 1 Access */
-	addroff |= (where & 0xff) | ((where & 0xf00) << 16);
-	addroff |= (bus << 16) | (devfn << 8);
-	return priv->cfg1_base + addroff;
+	if (acpi_disabled)
+		return (struct loongson_pci *)(bus->sysdata);
+
+	cfg = bus->sysdata;
+	return (struct loongson_pci *)(cfg->priv);
 }
 
-static void __iomem *cfg0_map(struct loongson_pci *priv, int bus,
-				unsigned int devfn, int where)
+static void __iomem *cfg0_map(struct loongson_pci *priv, struct pci_bus *bus,
+			      unsigned int devfn, int where)
 {
 	unsigned long addroff = 0x0;
+	unsigned char busnum = bus->number;
 
-	if (bus != 0)
+	if (!pci_is_root_bus(bus)) {
 		addroff |= BIT(24); /* Type 1 Access */
-	addroff |= (bus << 16) | (devfn << 8) | where;
+		addroff |= (busnum << 16);
+	}
+	addroff |= (devfn << 8) | where;
 	return priv->cfg0_base + addroff;
 }
 
-static void __iomem *pci_loongson_map_bus(struct pci_bus *bus, unsigned int devfn,
-			       int where)
+static void __iomem *cfg1_map(struct loongson_pci *priv, struct pci_bus *bus,
+			      unsigned int devfn, int where)
 {
+	unsigned long addroff = 0x0;
 	unsigned char busnum = bus->number;
-	struct pci_host_bridge *bridge = pci_find_host_bridge(bus);
-	struct loongson_pci *priv =  pci_host_bridge_priv(bridge);
+
+	if (!pci_is_root_bus(bus)) {
+		addroff |= BIT(28); /* Type 1 Access */
+		addroff |= (busnum << 16);
+	}
+	addroff |= (devfn << 8) | (where & 0xff) | ((where & 0xf00) << 16);
+	return priv->cfg1_base + addroff;
+}
+
+static void __iomem *pci_loongson_map_bus(struct pci_bus *bus,
+					  unsigned int devfn, int where)
+{
+	struct loongson_pci *priv = pci_bus_to_loongson_pci(bus);
 
 	/*
 	 * Do not read more than one device on the bus other than
-	 * the host bus. For our hardware the root bus is always bus 0.
+	 * the host bus.
 	 */
 	if (priv->data->flags & FLAG_DEV_FIX &&
 			!pci_is_root_bus(bus) && PCI_SLOT(devfn) > 0)
@@ -137,15 +153,17 @@ static void __iomem *pci_loongson_map_bus(struct pci_bus *bus, unsigned int devf
 
 	/* CFG0 can only access standard space */
 	if (where < PCI_CFG_SPACE_SIZE && priv->cfg0_base)
-		return cfg0_map(priv, busnum, devfn, where);
+		return cfg0_map(priv, bus, devfn, where);
 
 	/* CFG1 can access extended space */
 	if (where < PCI_CFG_SPACE_EXP_SIZE && priv->cfg1_base)
-		return cfg1_map(priv, busnum, devfn, where);
+		return cfg1_map(priv, bus, devfn, where);
 
 	return NULL;
 }
 
+#ifdef CONFIG_OF
+
 static int loongson_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 {
 	int irq;
@@ -259,3 +277,41 @@ static struct platform_driver loongson_pci_driver = {
 	.probe = loongson_pci_probe,
 };
 builtin_platform_driver(loongson_pci_driver);
+
+#endif
+
+#ifdef CONFIG_ACPI
+
+static int loongson_pci_ecam_init(struct pci_config_window *cfg)
+{
+	struct device *dev = cfg->parent;
+	struct loongson_pci *priv;
+	struct loongson_pci_data *data;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	cfg->priv = priv;
+	data->flags = FLAG_CFG1;
+	priv->data = data;
+	priv->cfg1_base = cfg->win - (cfg->busr.start << 16);
+
+	return 0;
+}
+
+const struct pci_ecam_ops loongson_pci_ecam_ops = {
+	.bus_shift = 16,
+	.init	   = loongson_pci_ecam_init,
+	.pci_ops   = {
+		.map_bus = pci_loongson_map_bus,
+		.read	 = pci_generic_config_read,
+		.write	 = pci_generic_config_write,
+	}
+};
+
+#endif
diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
index adea5a4771cf..6b1301e2498e 100644
--- a/include/linux/pci-ecam.h
+++ b/include/linux/pci-ecam.h
@@ -87,6 +87,7 @@ extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 *
 extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */
 extern const struct pci_ecam_ops al_pcie_ops;	/* Amazon Annapurna Labs PCIe */
 extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */
+extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */
 #endif
 
 #if IS_ENABLED(CONFIG_PCI_HOST_COMMON)
-- 
cgit v1.2.3


From e2863a78593d638d3924a6f67900c4820034f349 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:24 -0700
Subject: lib/bitmap: change return types to bool where appropriate

Some bitmap functions return boolean results in int variables. Fix it
by changing return types to bool.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitmap.h       | 8 ++++----
 lib/bitmap.c                 | 4 ++--
 tools/include/linux/bitmap.h | 8 ++++----
 tools/lib/bitmap.c           | 2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index c91638e507f2..e1a438bdda52 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -148,13 +148,13 @@ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
 			 unsigned int shift, unsigned int nbits);
 void bitmap_cut(unsigned long *dst, const unsigned long *src,
 		unsigned int first, unsigned int cut, unsigned int nbits);
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 		  const unsigned long *bitmap2, unsigned int nbits);
-int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
 		    const unsigned long *bitmap2, unsigned int nbits);
 void __bitmap_replace(unsigned long *dst,
 		      const unsigned long *old, const unsigned long *new,
@@ -315,7 +315,7 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits);
 	bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits))
 #endif
 
-static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
+static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
@@ -341,7 +341,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 		__bitmap_xor(dst, src1, src2, nbits);
 }
 
-static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
+static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
diff --git a/lib/bitmap.c b/lib/bitmap.c
index e903e13c62e1..9bc80f5bf149 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -237,7 +237,7 @@ void bitmap_cut(unsigned long *dst, const unsigned long *src,
 }
 EXPORT_SYMBOL(bitmap_cut);
 
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 				const unsigned long *bitmap2, unsigned int bits)
 {
 	unsigned int k;
@@ -275,7 +275,7 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_xor);
 
-int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
 				const unsigned long *bitmap2, unsigned int bits)
 {
 	unsigned int k;
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index afdf93bebaaf..2ae7ab8ed7d1 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -14,7 +14,7 @@
 int __bitmap_weight(const unsigned long *bitmap, int bits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, int bits);
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int bits);
 bool __bitmap_equal(const unsigned long *bitmap1,
 		    const unsigned long *bitmap2, unsigned int bits);
@@ -45,7 +45,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 	dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
 }
 
-static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
+static inline bool bitmap_empty(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
@@ -53,7 +53,7 @@ static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
 	return find_first_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
+static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
@@ -146,7 +146,7 @@ size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
  * @src2: operand 2
  * @nbits: size of bitmap
  */
-static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
+static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1,
 			     const unsigned long *src2, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index 354f8cdc0880..2e351d63fdba 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -57,7 +57,7 @@ size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits,
 	return ret;
 }
 
-int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int bits)
 {
 	unsigned int k;
-- 
cgit v1.2.3


From 309c56e84602d894e7ca424b0b13837117568fca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2022 10:06:13 +0200
Subject: iommu: remove the unused dev_has_feat method

This method is never actually called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220708080616.238833-2-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 -
 include/linux/iommu.h                       | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index d9c1623ec1a9..1b6c17dd81ee 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2853,7 +2853,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
-	.dev_has_feat		= arm_smmu_dev_has_feature,
 	.dev_feat_enabled	= arm_smmu_dev_feature_enabled,
 	.dev_enable_feat	= arm_smmu_dev_enable_feature,
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e6abd998dbe7..a3acdb46b939 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -164,8 +164,7 @@ struct iommu_iort_rmr_data {
  *			 supported, this feature must be enabled before and
  *			 disabled after %IOMMU_DEV_FEAT_SVA.
  *
- * Device drivers query whether a feature is supported using
- * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature().
+ * Device drivers enable a feature using iommu_dev_enable_feature().
  */
 enum iommu_dev_features {
 	IOMMU_DEV_FEAT_SVA,
@@ -248,7 +247,6 @@ struct iommu_ops {
 	bool (*is_attach_deferred)(struct device *dev);
 
 	/* Per device IOMMU features */
-	bool (*dev_has_feat)(struct device *dev, enum iommu_dev_features f);
 	bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f);
-- 
cgit v1.2.3


From a871765d5588531e1972902d1ae077b8be306c94 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2022 10:06:14 +0200
Subject: iommu: remove iommu_dev_feature_enabled

Remove the unused iommu_dev_feature_enabled function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220708080616.238833-3-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  1 -
 drivers/iommu/iommu.c                       | 13 -------------
 include/linux/iommu.h                       |  9 ---------
 3 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 1b6c17dd81ee..4d30a8d2bc23 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2853,7 +2853,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
-	.dev_feat_enabled	= arm_smmu_dev_feature_enabled,
 	.dev_enable_feat	= arm_smmu_dev_enable_feature,
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
 	.sva_bind		= arm_smmu_sva_bind,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0aa141646bdf..1bb016a6a2aa 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2760,19 +2760,6 @@ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
 }
 EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
 
-bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
-{
-	if (dev->iommu && dev->iommu->iommu_dev) {
-		const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
-
-		if (ops->dev_feat_enabled)
-			return ops->dev_feat_enabled(dev, feat);
-	}
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled);
-
 /**
  * iommu_sva_bind_device() - Bind a process address space to a device
  * @dev: the device
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a3acdb46b939..0bc2eb14b026 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -215,7 +215,6 @@ struct iommu_iotlb_gather {
  *                      driver init to device driver init (default no)
  * @dev_has/enable/disable_feat: per device entries to check/enable/disable
  *                               iommu specific features.
- * @dev_feat_enabled: check enabled feature
  * @sva_bind: Bind process address space to device
  * @sva_unbind: Unbind process address space from device
  * @sva_get_pasid: Get PASID associated to a SVA handle
@@ -247,7 +246,6 @@ struct iommu_ops {
 	bool (*is_attach_deferred)(struct device *dev);
 
 	/* Per device IOMMU features */
-	bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f);
 
@@ -670,7 +668,6 @@ void iommu_release_device(struct device *dev);
 
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f);
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f);
-bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f);
 
 struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm,
@@ -997,12 +994,6 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
 	return NULL;
 }
 
-static inline bool
-iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat)
-{
-	return false;
-}
-
 static inline int
 iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-- 
cgit v1.2.3


From ae3ff39a51a0f5843960487962e110339f321b0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 8 Jul 2022 10:06:15 +0200
Subject: iommu: remove the put_resv_regions method

All drivers that implement get_resv_regions just use
generic_put_resv_regions to implement the put side.  Remove the
indirections and document the allocations constraints.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20220708080616.238833-4-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/iommu.c                   |  1 -
 drivers/iommu/apple-dart.c                  |  1 -
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  1 -
 drivers/iommu/arm/arm-smmu/arm-smmu.c       |  1 -
 drivers/iommu/intel/iommu.c                 |  1 -
 drivers/iommu/iommu.c                       | 21 ++++-----------------
 drivers/iommu/mtk_iommu.c                   |  1 -
 drivers/iommu/virtio-iommu.c                |  5 ++---
 include/linux/iommu.h                       |  4 ----
 9 files changed, 6 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 840831d5d2ad..e66e071e8c3b 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2280,7 +2280,6 @@ const struct iommu_ops amd_iommu_ops = {
 	.probe_finalize = amd_iommu_probe_finalize,
 	.device_group = amd_iommu_device_group,
 	.get_resv_regions = amd_iommu_get_resv_regions,
-	.put_resv_regions = generic_iommu_put_resv_regions,
 	.is_attach_deferred = amd_iommu_is_attach_deferred,
 	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
 	.def_domain_type = amd_iommu_def_domain_type,
diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c
index e87d3cf54ed6..1b1725759262 100644
--- a/drivers/iommu/apple-dart.c
+++ b/drivers/iommu/apple-dart.c
@@ -768,7 +768,6 @@ static const struct iommu_ops apple_dart_iommu_ops = {
 	.of_xlate = apple_dart_of_xlate,
 	.def_domain_type = apple_dart_def_domain_type,
 	.get_resv_regions = apple_dart_get_resv_regions,
-	.put_resv_regions = generic_iommu_put_resv_regions,
 	.pgsize_bitmap = -1UL, /* Restricted during dart probe */
 	.owner = THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 4d30a8d2bc23..4a5e435567f1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2852,7 +2852,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.device_group		= arm_smmu_device_group,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.dev_enable_feat	= arm_smmu_dev_enable_feature,
 	.dev_disable_feat	= arm_smmu_dev_disable_feature,
 	.sva_bind		= arm_smmu_sva_bind,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 588929bed1bc..2d4129a4ccfc 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1584,7 +1584,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.device_group		= arm_smmu_device_group,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.def_domain_type	= arm_smmu_def_domain_type,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
 	.owner			= THIS_MODULE,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 44016594831d..49d616aa2148 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4911,7 +4911,6 @@ const struct iommu_ops intel_iommu_ops = {
 	.probe_finalize		= intel_iommu_probe_finalize,
 	.release_device		= intel_iommu_release_device,
 	.get_resv_regions	= intel_iommu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.device_group		= intel_iommu_device_group,
 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 1bb016a6a2aa..f53f8b2d27a5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2579,27 +2579,14 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 		ops->get_resv_regions(dev, list);
 }
 
-void iommu_put_resv_regions(struct device *dev, struct list_head *list)
-{
-	const struct iommu_ops *ops = dev_iommu_ops(dev);
-
-	if (ops->put_resv_regions)
-		ops->put_resv_regions(dev, list);
-}
-
 /**
- * generic_iommu_put_resv_regions - Reserved region driver helper
+ * iommu_put_resv_regions - release resered regions
  * @dev: device for which to free reserved regions
  * @list: reserved region list for device
  *
- * IOMMU drivers can use this to implement their .put_resv_regions() callback
- * for simple reservations. If a per region callback is provided that will be
- * used to free all memory allocations associated with the reserved region or
- * else just free up the memory for the regions. If an IOMMU driver allocates
- * additional resources per region, it is going to have to implement a custom
- * callback.
+ * This releases a reserved region list acquired by iommu_get_resv_regions().
  */
-void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
+void iommu_put_resv_regions(struct device *dev, struct list_head *list)
 {
 	struct iommu_resv_region *entry, *next;
 
@@ -2610,7 +2597,7 @@ void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list)
 			kfree(entry);
 	}
 }
-EXPORT_SYMBOL(generic_iommu_put_resv_regions);
+EXPORT_SYMBOL(iommu_put_resv_regions);
 
 struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start,
 						  size_t length, int prot,
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 5c3d9366c25c..95fd21c7207a 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -928,7 +928,6 @@ static const struct iommu_ops mtk_iommu_ops = {
 	.device_group	= mtk_iommu_device_group,
 	.of_xlate	= mtk_iommu_of_xlate,
 	.get_resv_regions = mtk_iommu_get_resv_regions,
-	.put_resv_regions = generic_iommu_put_resv_regions,
 	.pgsize_bitmap	= SZ_4K | SZ_64K | SZ_1M | SZ_16M,
 	.owner		= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 55337796a5f8..feeb5fde72a3 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -964,7 +964,7 @@ static struct iommu_device *viommu_probe_device(struct device *dev)
 	return &viommu->iommu;
 
 err_free_dev:
-	generic_iommu_put_resv_regions(dev, &vdev->resv_regions);
+	iommu_put_resv_regions(dev, &vdev->resv_regions);
 	kfree(vdev);
 
 	return ERR_PTR(ret);
@@ -983,7 +983,7 @@ static void viommu_release_device(struct device *dev)
 {
 	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
 
-	generic_iommu_put_resv_regions(dev, &vdev->resv_regions);
+	iommu_put_resv_regions(dev, &vdev->resv_regions);
 	kfree(vdev);
 }
 
@@ -1007,7 +1007,6 @@ static struct iommu_ops viommu_ops = {
 	.release_device		= viommu_release_device,
 	.device_group		= viommu_device_group,
 	.get_resv_regions	= viommu_get_resv_regions,
-	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.of_xlate		= viommu_of_xlate,
 	.owner			= THIS_MODULE,
 	.default_domain_ops = &(const struct iommu_domain_ops) {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0bc2eb14b026..ea30f00dc145 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -209,7 +209,6 @@ struct iommu_iotlb_gather {
  *                  group and attached to the groups domain
  * @device_group: find iommu group for a particular device
  * @get_resv_regions: Request list of reserved regions for a device
- * @put_resv_regions: Free list of reserved regions for a device
  * @of_xlate: add OF master IDs to iommu grouping
  * @is_attach_deferred: Check if domain attach should be deferred from iommu
  *                      driver init to device driver init (default no)
@@ -240,7 +239,6 @@ struct iommu_ops {
 
 	/* Request/Free a list of reserved regions for a device */
 	void (*get_resv_regions)(struct device *dev, struct list_head *list);
-	void (*put_resv_regions)(struct device *dev, struct list_head *list);
 
 	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
 	bool (*is_attach_deferred)(struct device *dev);
@@ -454,8 +452,6 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
 
 extern void iommu_get_resv_regions(struct device *dev, struct list_head *list);
 extern void iommu_put_resv_regions(struct device *dev, struct list_head *list);
-extern void generic_iommu_put_resv_regions(struct device *dev,
-					   struct list_head *list);
 extern void iommu_set_default_passthrough(bool cmd_line);
 extern void iommu_set_default_translated(bool cmd_line);
 extern bool iommu_default_passthrough(void);
-- 
cgit v1.2.3


From f9903555dd05a4096d8fef4eb823c0b94f710982 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:46 +0800
Subject: iommu/vt-d: Remove unnecessary exported symbol

The exported symbol intel_iommu_gfx_mapped is not used anywhere in the
tree. Remove it to avoid dead code.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 6 ------
 include/linux/intel-iommu.h | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 5c0dce78586a..6564af7ec0dd 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -314,9 +314,6 @@ static int iommu_skip_te_disable;
 #define IDENTMAP_GFX		2
 #define IDENTMAP_AZALIA		4
 
-int intel_iommu_gfx_mapped;
-EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
-
 DEFINE_SPINLOCK(device_domain_lock);
 static LIST_HEAD(device_domain_list);
 
@@ -4093,9 +4090,6 @@ int __init intel_iommu_init(void)
 	if (list_empty(&dmar_satc_units))
 		pr_info("No SATC found\n");
 
-	if (dmar_map_gfx)
-		intel_iommu_gfx_mapped = 1;
-
 	init_no_remapping_devices();
 
 	ret = init_dmars();
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 5fcf89faa31a..4ebf3c4da45e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -787,7 +787,6 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu);
 extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
 extern int dmar_disabled;
 extern int intel_iommu_enabled;
-extern int intel_iommu_gfx_mapped;
 #else
 static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
 {
-- 
cgit v1.2.3


From 853788b9a66ff089053df3b4ed7d166e61def449 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:49 +0800
Subject: x86/boot/tboot: Move tboot_force_iommu() to Intel IOMMU

tboot_force_iommu() is only called by the Intel IOMMU driver. Move the
helper into that driver. No functional change intended.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-7-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/kernel/tboot.c     | 15 ---------------
 drivers/iommu/intel/iommu.c | 14 ++++++++++++++
 include/linux/tboot.h       |  2 --
 3 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 0c1154a1c403..3bacd935f840 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -6,7 +6,6 @@
  * Copyright (c) 2006-2009, Intel Corporation
  */
 
-#include <linux/intel-iommu.h>
 #include <linux/init_task.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
@@ -516,17 +515,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb
 
 	return dmar_tbl;
 }
-
-int tboot_force_iommu(void)
-{
-	if (!tboot_enabled())
-		return 0;
-
-	if (no_iommu || dmar_disabled)
-		pr_warn("Forcing Intel-IOMMU to enabled\n");
-
-	dmar_disabled = 0;
-	no_iommu = 0;
-
-	return 1;
-}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6564af7ec0dd..c70948021e8e 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4019,6 +4019,20 @@ static int __init probe_acpi_namespace_devices(void)
 	return 0;
 }
 
+static __init int tboot_force_iommu(void)
+{
+	if (!tboot_enabled())
+		return 0;
+
+	if (no_iommu || dmar_disabled)
+		pr_warn("Forcing Intel-IOMMU to enabled\n");
+
+	dmar_disabled = 0;
+	no_iommu = 0;
+
+	return 1;
+}
+
 int __init intel_iommu_init(void)
 {
 	int ret = -ENODEV;
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index 5146d2574e85..d2279160ef39 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -126,7 +126,6 @@ extern void tboot_probe(void);
 extern void tboot_shutdown(u32 shutdown_type);
 extern struct acpi_table_header *tboot_get_dmar_table(
 				      struct acpi_table_header *dmar_tbl);
-extern int tboot_force_iommu(void);
 
 #else
 
@@ -136,7 +135,6 @@ extern int tboot_force_iommu(void);
 #define tboot_sleep(sleep_state, pm1a_control, pm1b_control)	\
 					do { } while (0)
 #define tboot_get_dmar_table(dmar_tbl)	(dmar_tbl)
-#define tboot_force_iommu()		0
 
 #endif /* !CONFIG_INTEL_TXT */
 
-- 
cgit v1.2.3


From 2585a2790e7fdb3dadfe309c9220bbc03704f84f Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:08:50 +0800
Subject: iommu/vt-d: Move include/linux/intel-iommu.h under iommu

This header file is private to the Intel IOMMU driver. Move it to the
driver folder.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/r/20220514014322.2927339-8-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 MAINTAINERS                         |   1 -
 drivers/iommu/intel/cap_audit.c     |   2 +-
 drivers/iommu/intel/debugfs.c       |   2 +-
 drivers/iommu/intel/dmar.c          |   2 +-
 drivers/iommu/intel/iommu.c         |   2 +-
 drivers/iommu/intel/iommu.h         | 831 ++++++++++++++++++++++++++++++++++++
 drivers/iommu/intel/irq_remapping.c |   2 +-
 drivers/iommu/intel/pasid.c         |   2 +-
 drivers/iommu/intel/perf.c          |   2 +-
 drivers/iommu/intel/svm.c           |   2 +-
 drivers/iommu/intel/trace.h         |   3 +-
 include/linux/intel-iommu.h         | 831 ------------------------------------
 12 files changed, 841 insertions(+), 841 deletions(-)
 create mode 100644 drivers/iommu/intel/iommu.h
 delete mode 100644 include/linux/intel-iommu.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index f679152bdbad..8f9ed151ad4c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10077,7 +10077,6 @@ L:	iommu@lists.linux.dev
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
 F:	drivers/iommu/intel/
-F:	include/linux/intel-iommu.h
 F:	include/linux/intel-svm.h
 
 INTEL IOP-ADMA DMA DRIVER
diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c
index 71596fc62822..3ee68393122f 100644
--- a/drivers/iommu/intel/cap_audit.c
+++ b/drivers/iommu/intel/cap_audit.c
@@ -10,7 +10,7 @@
 
 #define pr_fmt(fmt)	"DMAR: " fmt
 
-#include <linux/intel-iommu.h>
+#include "iommu.h"
 #include "cap_audit.h"
 
 static u64 intel_iommu_cap_sanity;
diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c
index ed796eea4581..d927ef10641b 100644
--- a/drivers/iommu/intel/debugfs.c
+++ b/drivers/iommu/intel/debugfs.c
@@ -10,11 +10,11 @@
 
 #include <linux/debugfs.h>
 #include <linux/dmar.h>
-#include <linux/intel-iommu.h>
 #include <linux/pci.h>
 
 #include <asm/irq_remapping.h>
 
+#include "iommu.h"
 #include "pasid.h"
 #include "perf.h"
 
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index f91b45be1d92..2a5e0f91e647 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -19,7 +19,6 @@
 #include <linux/pci.h>
 #include <linux/dmar.h>
 #include <linux/iova.h>
-#include <linux/intel-iommu.h>
 #include <linux/timer.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -31,6 +30,7 @@
 #include <linux/limits.h>
 #include <asm/irq_remapping.h>
 
+#include "iommu.h"
 #include "../irq_remapping.h"
 #include "perf.h"
 #include "trace.h"
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index c70948021e8e..10bda4bec8fe 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -17,7 +17,6 @@
 #include <linux/dma-direct.h>
 #include <linux/dma-iommu.h>
 #include <linux/dmi.h>
-#include <linux/intel-iommu.h>
 #include <linux/intel-svm.h>
 #include <linux/memory.h>
 #include <linux/pci.h>
@@ -26,6 +25,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/tboot.h>
 
+#include "iommu.h"
 #include "../irq_remapping.h"
 #include "../iommu-sva-lib.h"
 #include "pasid.h"
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
new file mode 100644
index 000000000000..4ebf3c4da45e
--- /dev/null
+++ b/drivers/iommu/intel/iommu.h
@@ -0,0 +1,831 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2006-2015, Intel Corporation.
+ *
+ * Authors: Ashok Raj <ashok.raj@intel.com>
+ *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *          David Woodhouse <David.Woodhouse@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <linux/types.h>
+#include <linux/iova.h>
+#include <linux/io.h>
+#include <linux/idr.h>
+#include <linux/mmu_notifier.h>
+#include <linux/list.h>
+#include <linux/iommu.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/dmar.h>
+#include <linux/ioasid.h>
+#include <linux/bitfield.h>
+
+#include <asm/cacheflush.h>
+#include <asm/iommu.h>
+
+/*
+ * VT-d hardware uses 4KiB page size regardless of host page size.
+ */
+#define VTD_PAGE_SHIFT		(12)
+#define VTD_PAGE_SIZE		(1UL << VTD_PAGE_SHIFT)
+#define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
+#define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
+
+#define VTD_STRIDE_SHIFT        (9)
+#define VTD_STRIDE_MASK         (((u64)-1) << VTD_STRIDE_SHIFT)
+
+#define DMA_PTE_READ		BIT_ULL(0)
+#define DMA_PTE_WRITE		BIT_ULL(1)
+#define DMA_PTE_LARGE_PAGE	BIT_ULL(7)
+#define DMA_PTE_SNP		BIT_ULL(11)
+
+#define DMA_FL_PTE_PRESENT	BIT_ULL(0)
+#define DMA_FL_PTE_US		BIT_ULL(2)
+#define DMA_FL_PTE_ACCESS	BIT_ULL(5)
+#define DMA_FL_PTE_DIRTY	BIT_ULL(6)
+#define DMA_FL_PTE_XD		BIT_ULL(63)
+
+#define ADDR_WIDTH_5LEVEL	(57)
+#define ADDR_WIDTH_4LEVEL	(48)
+
+#define CONTEXT_TT_MULTI_LEVEL	0
+#define CONTEXT_TT_DEV_IOTLB	1
+#define CONTEXT_TT_PASS_THROUGH 2
+#define CONTEXT_PASIDE		BIT_ULL(3)
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
+#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
+#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
+#define	DMAR_GCMD_REG	0x18	/* Global command register */
+#define	DMAR_GSTS_REG	0x1c	/* Global status register */
+#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
+#define	DMAR_CCMD_REG	0x28	/* Context command reg */
+#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
+#define	DMAR_FECTL_REG	0x38	/* Fault control register */
+#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
+#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
+#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
+#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
+#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
+#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
+#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
+#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
+#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
+#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
+#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
+#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
+#define DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
+#define DMAR_IQER_REG	0xb0	/* Invalidation queue error record register */
+#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
+#define DMAR_PQH_REG	0xc0	/* Page request queue head register */
+#define DMAR_PQT_REG	0xc8	/* Page request queue tail register */
+#define DMAR_PQA_REG	0xd0	/* Page request queue address register */
+#define DMAR_PRS_REG	0xdc	/* Page request status register */
+#define DMAR_PECTL_REG	0xe0	/* Page request event control register */
+#define	DMAR_PEDATA_REG	0xe4	/* Page request event interrupt data register */
+#define	DMAR_PEADDR_REG	0xe8	/* Page request event interrupt addr register */
+#define	DMAR_PEUADDR_REG 0xec	/* Page request event Upper address register */
+#define DMAR_MTRRCAP_REG 0x100	/* MTRR capability register */
+#define DMAR_MTRRDEF_REG 0x108	/* MTRR default type register */
+#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */
+#define DMAR_MTRR_FIX16K_80000_REG 0x128
+#define DMAR_MTRR_FIX16K_A0000_REG 0x130
+#define DMAR_MTRR_FIX4K_C0000_REG 0x138
+#define DMAR_MTRR_FIX4K_C8000_REG 0x140
+#define DMAR_MTRR_FIX4K_D0000_REG 0x148
+#define DMAR_MTRR_FIX4K_D8000_REG 0x150
+#define DMAR_MTRR_FIX4K_E0000_REG 0x158
+#define DMAR_MTRR_FIX4K_E8000_REG 0x160
+#define DMAR_MTRR_FIX4K_F0000_REG 0x168
+#define DMAR_MTRR_FIX4K_F8000_REG 0x170
+#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */
+#define DMAR_MTRR_PHYSMASK0_REG 0x188
+#define DMAR_MTRR_PHYSBASE1_REG 0x190
+#define DMAR_MTRR_PHYSMASK1_REG 0x198
+#define DMAR_MTRR_PHYSBASE2_REG 0x1a0
+#define DMAR_MTRR_PHYSMASK2_REG 0x1a8
+#define DMAR_MTRR_PHYSBASE3_REG 0x1b0
+#define DMAR_MTRR_PHYSMASK3_REG 0x1b8
+#define DMAR_MTRR_PHYSBASE4_REG 0x1c0
+#define DMAR_MTRR_PHYSMASK4_REG 0x1c8
+#define DMAR_MTRR_PHYSBASE5_REG 0x1d0
+#define DMAR_MTRR_PHYSMASK5_REG 0x1d8
+#define DMAR_MTRR_PHYSBASE6_REG 0x1e0
+#define DMAR_MTRR_PHYSMASK6_REG 0x1e8
+#define DMAR_MTRR_PHYSBASE7_REG 0x1f0
+#define DMAR_MTRR_PHYSMASK7_REG 0x1f8
+#define DMAR_MTRR_PHYSBASE8_REG 0x200
+#define DMAR_MTRR_PHYSMASK8_REG 0x208
+#define DMAR_MTRR_PHYSBASE9_REG 0x210
+#define DMAR_MTRR_PHYSMASK9_REG 0x218
+#define DMAR_VCCAP_REG		0xe30 /* Virtual command capability register */
+#define DMAR_VCMD_REG		0xe00 /* Virtual command register */
+#define DMAR_VCRSP_REG		0xe10 /* Virtual command response register */
+
+#define DMAR_IQER_REG_IQEI(reg)		FIELD_GET(GENMASK_ULL(3, 0), reg)
+#define DMAR_IQER_REG_ITESID(reg)	FIELD_GET(GENMASK_ULL(47, 32), reg)
+#define DMAR_IQER_REG_ICESID(reg)	FIELD_GET(GENMASK_ULL(63, 48), reg)
+
+#define OFFSET_STRIDE		(9)
+
+#define dmar_readq(a) readq(a)
+#define dmar_writeq(a,v) writeq(v,a)
+#define dmar_readl(a) readl(a)
+#define dmar_writel(a, v) writel(v, a)
+
+#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
+#define DMAR_VER_MINOR(v)		((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_5lp_support(c)	(((c) >> 60) & 1)
+#define cap_pi_support(c)	(((c) >> 59) & 1)
+#define cap_fl1gp_support(c)	(((c) >> 56) & 1)
+#define cap_read_drain(c)	(((c) >> 55) & 1)
+#define cap_write_drain(c)	(((c) >> 54) & 1)
+#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
+
+#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
+#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
+					* OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
+#define cap_max_fault_reg_offset(c) \
+	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
+
+#define cap_zlr(c)		(((c) >> 22) & 1)
+#define cap_isoch(c)		(((c) >> 23) & 1)
+#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
+#define cap_caching_mode(c)	(((c) >> 7) & 1)
+#define cap_phmr(c)		(((c) >> 6) & 1)
+#define cap_plmr(c)		(((c) >> 5) & 1)
+#define cap_rwbf(c)		(((c) >> 4) & 1)
+#define cap_afl(c)		(((c) >> 3) & 1)
+#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
+/*
+ * Extended Capability Register
+ */
+
+#define	ecap_rps(e)		(((e) >> 49) & 0x1)
+#define ecap_smpwc(e)		(((e) >> 48) & 0x1)
+#define ecap_flts(e)		(((e) >> 47) & 0x1)
+#define ecap_slts(e)		(((e) >> 46) & 0x1)
+#define ecap_slads(e)		(((e) >> 45) & 0x1)
+#define ecap_vcs(e)		(((e) >> 44) & 0x1)
+#define ecap_smts(e)		(((e) >> 43) & 0x1)
+#define ecap_dit(e)		(((e) >> 41) & 0x1)
+#define ecap_pds(e)		(((e) >> 42) & 0x1)
+#define ecap_pasid(e)		(((e) >> 40) & 0x1)
+#define ecap_pss(e)		(((e) >> 35) & 0x1f)
+#define ecap_eafs(e)		(((e) >> 34) & 0x1)
+#define ecap_nwfs(e)		(((e) >> 33) & 0x1)
+#define ecap_srs(e)		(((e) >> 31) & 0x1)
+#define ecap_ers(e)		(((e) >> 30) & 0x1)
+#define ecap_prs(e)		(((e) >> 29) & 0x1)
+#define ecap_broken_pasid(e)	(((e) >> 28) & 0x1)
+#define ecap_dis(e)		(((e) >> 27) & 0x1)
+#define ecap_nest(e)		(((e) >> 26) & 0x1)
+#define ecap_mts(e)		(((e) >> 25) & 0x1)
+#define ecap_ecs(e)		(((e) >> 24) & 0x1)
+#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
+#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
+#define ecap_coherent(e)	((e) & 0x1)
+#define ecap_qis(e)		((e) & 0x2)
+#define ecap_pass_through(e)	(((e) >> 6) & 0x1)
+#define ecap_eim_support(e)	(((e) >> 4) & 0x1)
+#define ecap_ir_support(e)	(((e) >> 3) & 0x1)
+#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
+#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
+#define ecap_sc_support(e)	(((e) >> 7) & 0x1) /* Snooping Control */
+
+/* Virtual command interface capability */
+#define vccap_pasid(v)		(((v) & DMA_VCS_PAS)) /* PASID allocation */
+
+/* IOTLB_REG */
+#define DMA_TLB_FLUSH_GRANU_OFFSET  60
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(type) ((type >> 60) & 3)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 3)
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
+#define DMA_TLB_IVT (((u64)1) << 63)
+#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
+#define DMA_TLB_MAX_SIZE (0x3f)
+
+/* INVALID_DESC */
+#define DMA_CCMD_INVL_GRANU_OFFSET  61
+#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 4)
+#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 4)
+#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 4)
+#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
+#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
+#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
+#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
+#define DMA_ID_TLB_ADDR(addr)	(addr)
+#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
+
+/* PMEN_REG */
+#define DMA_PMEN_EPM (((u32)1)<<31)
+#define DMA_PMEN_PRS (((u32)1)<<0)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE (((u32)1) << 31)
+#define DMA_GCMD_SRTP (((u32)1) << 30)
+#define DMA_GCMD_SFL (((u32)1) << 29)
+#define DMA_GCMD_EAFL (((u32)1) << 28)
+#define DMA_GCMD_WBF (((u32)1) << 27)
+#define DMA_GCMD_QIE (((u32)1) << 26)
+#define DMA_GCMD_SIRTP (((u32)1) << 24)
+#define DMA_GCMD_IRE (((u32) 1) << 25)
+#define DMA_GCMD_CFI (((u32) 1) << 23)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES (((u32)1) << 31)
+#define DMA_GSTS_RTPS (((u32)1) << 30)
+#define DMA_GSTS_FLS (((u32)1) << 29)
+#define DMA_GSTS_AFLS (((u32)1) << 28)
+#define DMA_GSTS_WBFS (((u32)1) << 27)
+#define DMA_GSTS_QIES (((u32)1) << 26)
+#define DMA_GSTS_IRTPS (((u32)1) << 24)
+#define DMA_GSTS_IRES (((u32)1) << 25)
+#define DMA_GSTS_CFIS (((u32)1) << 23)
+
+/* DMA_RTADDR_REG */
+#define DMA_RTADDR_RTT (((u64)1) << 11)
+#define DMA_RTADDR_SMT (((u64)1) << 10)
+
+/* CCMD_REG */
+#define DMA_CCMD_ICC (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u32)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */
+#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */
+#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */
+#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */
+#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */
+#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u32)1) << 31)
+#define dma_frcd_type(d) ((d >> 30) & 1)
+#define dma_frcd_fault_reason(c) (c & 0xff)
+#define dma_frcd_source_id(c) (c & 0xffff)
+#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff)
+#define dma_frcd_pasid_present(c) (((c) >> 31) & 1)
+/* low 64 bit */
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT))
+
+/* PRS_REG */
+#define DMA_PRS_PPR	((u32)1)
+#define DMA_PRS_PRO	((u32)2)
+
+#define DMA_VCS_PAS	((u64)1)
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
+do {									\
+	cycles_t start_time = get_cycles();				\
+	while (1) {							\
+		sts = op(iommu->reg + offset);				\
+		if (cond)						\
+			break;						\
+		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
+			panic("DMAR hardware is malfunctioning\n");	\
+		cpu_relax();						\
+	}								\
+} while (0)
+
+#define QI_LENGTH	256	/* queue length */
+
+enum {
+	QI_FREE,
+	QI_IN_USE,
+	QI_DONE,
+	QI_ABORT
+};
+
+#define QI_CC_TYPE		0x1
+#define QI_IOTLB_TYPE		0x2
+#define QI_DIOTLB_TYPE		0x3
+#define QI_IEC_TYPE		0x4
+#define QI_IWD_TYPE		0x5
+#define QI_EIOTLB_TYPE		0x6
+#define QI_PC_TYPE		0x7
+#define QI_DEIOTLB_TYPE		0x8
+#define QI_PGRP_RESP_TYPE	0x9
+#define QI_PSTRM_RESP_TYPE	0xa
+
+#define QI_IEC_SELECTIVE	(((u64)1) << 4)
+#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
+#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
+
+#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
+#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
+#define QI_IWD_FENCE		(((u64)1) << 6)
+#define QI_IWD_PRQ_DRAIN	(((u64)1) << 7)
+
+#define QI_IOTLB_DID(did) 	(((u64)did) << 16)
+#define QI_IOTLB_DR(dr) 	(((u64)dr) << 7)
+#define QI_IOTLB_DW(dw) 	(((u64)dw) << 6)
+#define QI_IOTLB_GRAN(gran) 	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
+#define QI_IOTLB_ADDR(addr)	(((u64)addr) & VTD_PAGE_MASK)
+#define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
+#define QI_IOTLB_AM(am)		(((u8)am) & 0x3f)
+
+#define QI_CC_FM(fm)		(((u64)fm) << 48)
+#define QI_CC_SID(sid)		(((u64)sid) << 32)
+#define QI_CC_DID(did)		(((u64)did) << 16)
+#define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
+
+#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
+#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
+#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
+				   ((u64)((pfsid >> 4) & 0xfff) << 52))
+#define QI_DEV_IOTLB_SIZE	1
+#define QI_DEV_IOTLB_MAX_INVS	32
+
+#define QI_PC_PASID(pasid)	(((u64)pasid) << 32)
+#define QI_PC_DID(did)		(((u64)did) << 16)
+#define QI_PC_GRAN(gran)	(((u64)gran) << 4)
+
+/* PASID cache invalidation granu */
+#define QI_PC_ALL_PASIDS	0
+#define QI_PC_PASID_SEL		1
+#define QI_PC_GLOBAL		3
+
+#define QI_EIOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_EIOTLB_IH(ih)	(((u64)ih) << 6)
+#define QI_EIOTLB_AM(am)	(((u64)am) & 0x3f)
+#define QI_EIOTLB_PASID(pasid) 	(((u64)pasid) << 32)
+#define QI_EIOTLB_DID(did)	(((u64)did) << 16)
+#define QI_EIOTLB_GRAN(gran) 	(((u64)gran) << 4)
+
+/* QI Dev-IOTLB inv granu */
+#define QI_DEV_IOTLB_GRAN_ALL		1
+#define QI_DEV_IOTLB_GRAN_PASID_SEL	0
+
+#define QI_DEV_EIOTLB_ADDR(a)	((u64)(a) & VTD_PAGE_MASK)
+#define QI_DEV_EIOTLB_SIZE	(((u64)1) << 11)
+#define QI_DEV_EIOTLB_PASID(p)	((u64)((p) & 0xfffff) << 32)
+#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
+#define QI_DEV_EIOTLB_QDEP(qd)	((u64)((qd) & 0x1f) << 4)
+#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
+				    ((u64)((pfsid >> 4) & 0xfff) << 52))
+#define QI_DEV_EIOTLB_MAX_INVS	32
+
+/* Page group response descriptor QW0 */
+#define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
+#define QI_PGRP_PDP(p)		(((u64)(p)) << 5)
+#define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
+#define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
+#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
+
+/* Page group response descriptor QW1 */
+#define QI_PGRP_LPIG(x)		(((u64)(x)) << 2)
+#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 3)
+
+
+#define QI_RESP_SUCCESS		0x0
+#define QI_RESP_INVALID		0x1
+#define QI_RESP_FAILURE		0xf
+
+#define QI_GRAN_NONG_PASID		2
+#define QI_GRAN_PSI_PASID		3
+
+#define qi_shift(iommu)		(DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap))
+
+struct qi_desc {
+	u64 qw0;
+	u64 qw1;
+	u64 qw2;
+	u64 qw3;
+};
+
+struct q_inval {
+	raw_spinlock_t  q_lock;
+	void		*desc;          /* invalidation queue */
+	int             *desc_status;   /* desc status */
+	int             free_head;      /* first free entry */
+	int             free_tail;      /* last free entry */
+	int             free_cnt;
+};
+
+struct dmar_pci_notify_info;
+
+#ifdef CONFIG_IRQ_REMAP
+/* 1MB - maximum possible interrupt remapping table size */
+#define INTR_REMAP_PAGE_ORDER	8
+#define INTR_REMAP_TABLE_REG_SIZE	0xf
+#define INTR_REMAP_TABLE_REG_SIZE_MASK  0xf
+
+#define INTR_REMAP_TABLE_ENTRIES	65536
+
+struct irq_domain;
+
+struct ir_table {
+	struct irte *base;
+	unsigned long *bitmap;
+};
+
+void intel_irq_remap_add_device(struct dmar_pci_notify_info *info);
+#else
+static inline void
+intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { }
+#endif
+
+struct iommu_flush {
+	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
+			      u8 fm, u64 type);
+	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
+			    unsigned int size_order, u64 type);
+};
+
+enum {
+	SR_DMAR_FECTL_REG,
+	SR_DMAR_FEDATA_REG,
+	SR_DMAR_FEADDR_REG,
+	SR_DMAR_FEUADDR_REG,
+	MAX_SR_DMAR_REGS
+};
+
+#define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
+#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
+#define VTD_FLAG_SVM_CAPABLE		(1 << 2)
+
+extern int intel_iommu_sm;
+extern spinlock_t device_domain_lock;
+
+#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
+#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
+				 ecap_pasid((iommu)->ecap))
+
+struct pasid_entry;
+struct pasid_state_entry;
+struct page_req_dsc;
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64     lo;
+	u64     hi;
+};
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+
+/*
+ * When VT-d works in the scalable mode, it allows DMA translation to
+ * happen through either first level or second level page table. This
+ * bit marks that the DMA translation for the domain goes through the
+ * first level page table, otherwise, it goes through the second level.
+ */
+#define DOMAIN_FLAG_USE_FIRST_LEVEL		BIT(1)
+
+struct dmar_domain {
+	int	nid;			/* node id */
+
+	unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED];
+					/* Refcount of devices per iommu */
+
+
+	u16		iommu_did[DMAR_UNITS_SUPPORTED];
+					/* Domain ids per IOMMU. Use u16 since
+					 * domain ids are 16 bit wide according
+					 * to VT-d spec, section 9.3 */
+
+	u8 has_iotlb_device: 1;
+	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
+	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
+	u8 set_pte_snp:1;
+
+	struct list_head devices;	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+	int		flags;		/* flags to find out type of domain */
+	int		iommu_superpage;/* Level of superpages supported:
+					   0 == 4KiB (no superpages), 1 == 2MiB,
+					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
+	u64		max_addr;	/* maximum mapped address */
+
+	struct iommu_domain domain;	/* generic domain data structure for
+					   iommu core */
+};
+
+struct intel_iommu {
+	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
+	u64 		reg_phys; /* physical address of hw register set */
+	u64		reg_size; /* size of hw register set */
+	u64		cap;
+	u64		ecap;
+	u64		vccap;
+	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
+	raw_spinlock_t	register_lock; /* protect register handling */
+	int		seq_id;	/* sequence id of the iommu */
+	int		agaw; /* agaw of this iommu */
+	int		msagaw; /* max sagaw of this iommu */
+	unsigned int 	irq, pr_irq;
+	u16		segment;     /* PCI segment# */
+	unsigned char 	name[13];    /* Device Name */
+
+#ifdef CONFIG_INTEL_IOMMU
+	unsigned long 	*domain_ids; /* bitmap of domains */
+	spinlock_t	lock; /* protect context, domain ids */
+	struct root_entry *root_entry; /* virtual address */
+
+	struct iommu_flush flush;
+#endif
+#ifdef CONFIG_INTEL_IOMMU_SVM
+	struct page_req_dsc *prq;
+	unsigned char prq_name[16];    /* Name for PRQ interrupt */
+	struct completion prq_complete;
+	struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */
+#endif
+	struct iopf_queue *iopf_queue;
+	unsigned char iopfq_name[16];
+	struct q_inval  *qi;            /* Queued invalidation info */
+	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
+
+#ifdef CONFIG_IRQ_REMAP
+	struct ir_table *ir_table;	/* Interrupt remapping info */
+	struct irq_domain *ir_domain;
+	struct irq_domain *ir_msi_domain;
+#endif
+	struct iommu_device iommu;  /* IOMMU core code handle */
+	int		node;
+	u32		flags;      /* Software defined flags */
+
+	struct dmar_drhd_unit *drhd;
+	void *perf_statistic;
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	u32 segment;		/* PCI segment number */
+	u8 bus;			/* PCI bus number */
+	u8 devfn;		/* PCI devfn number */
+	u16 pfsid;		/* SRIOV physical function source ID */
+	u8 pasid_supported:3;
+	u8 pasid_enabled:1;
+	u8 pri_supported:1;
+	u8 pri_enabled:1;
+	u8 ats_supported:1;
+	u8 ats_enabled:1;
+	u8 ats_qdep;
+	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
+	struct dmar_domain *domain; /* pointer to domain */
+	struct pasid_table *pasid_table; /* pasid table */
+};
+
+static inline void __iommu_flush_cache(
+	struct intel_iommu *iommu, void *addr, int size)
+{
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(addr, size);
+}
+
+/* Convert generic struct iommu_domain to private struct dmar_domain */
+static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
+{
+	return container_of(dom, struct dmar_domain, domain);
+}
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-10: available
+ * 11: snoop behavior
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+	u64 val;
+};
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+	pte->val = 0;
+}
+
+static inline u64 dma_pte_addr(struct dma_pte *pte)
+{
+#ifdef CONFIG_64BIT
+	return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
+#else
+	/* Must have a full atomic 64-bit read */
+	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) &
+			VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
+#endif
+}
+
+static inline bool dma_pte_present(struct dma_pte *pte)
+{
+	return (pte->val & 3) != 0;
+}
+
+static inline bool dma_pte_superpage(struct dma_pte *pte)
+{
+	return (pte->val & DMA_PTE_LARGE_PAGE);
+}
+
+static inline bool first_pte_in_page(struct dma_pte *pte)
+{
+	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
+}
+
+static inline int nr_pte_to_next_page(struct dma_pte *pte)
+{
+	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
+		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
+}
+
+extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+
+extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void dmar_disable_qi(struct intel_iommu *iommu);
+extern int dmar_reenable_qi(struct intel_iommu *iommu);
+extern void qi_global_iec(struct intel_iommu *iommu);
+
+extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
+			     u8 fm, u64 type);
+extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+			  unsigned int size_order, u64 type);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+			u16 qdep, u64 addr, unsigned mask);
+
+void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
+		     unsigned long npages, bool ih);
+
+void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+			      u32 pasid, u16 qdep, u64 addr,
+			      unsigned int size_order);
+void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
+			  u32 pasid);
+
+int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
+		   unsigned int count, unsigned long options);
+/*
+ * Options used in qi_submit_sync:
+ * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8.
+ */
+#define QI_OPT_WAIT_DRAIN		BIT(0)
+
+extern int dmar_ir_support(void);
+
+void *alloc_pgtable_page(int node);
+void free_pgtable_page(void *vaddr);
+struct intel_iommu *domain_get_iommu(struct dmar_domain *domain);
+void iommu_flush_write_buffer(struct intel_iommu *iommu);
+int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
+struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+extern void intel_svm_check(struct intel_iommu *iommu);
+extern int intel_svm_enable_prq(struct intel_iommu *iommu);
+extern int intel_svm_finish_prq(struct intel_iommu *iommu);
+struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm,
+				 void *drvdata);
+void intel_svm_unbind(struct iommu_sva *handle);
+u32 intel_svm_get_pasid(struct iommu_sva *handle);
+int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt,
+			    struct iommu_page_response *msg);
+
+struct intel_svm_dev {
+	struct list_head list;
+	struct rcu_head rcu;
+	struct device *dev;
+	struct intel_iommu *iommu;
+	struct iommu_sva sva;
+	unsigned long prq_seq_number;
+	u32 pasid;
+	int users;
+	u16 did;
+	u16 dev_iotlb:1;
+	u16 sid, qdep;
+};
+
+struct intel_svm {
+	struct mmu_notifier notifier;
+	struct mm_struct *mm;
+
+	unsigned int flags;
+	u32 pasid;
+	struct list_head devs;
+};
+#else
+static inline void intel_svm_check(struct intel_iommu *iommu) {}
+#endif
+
+#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
+void intel_iommu_debugfs_init(void);
+#else
+static inline void intel_iommu_debugfs_init(void) {}
+#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
+
+extern const struct attribute_group *intel_iommu_groups[];
+bool context_present(struct context_entry *context);
+struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
+					 u8 devfn, int alloc);
+
+extern const struct iommu_ops intel_iommu_ops;
+
+#ifdef CONFIG_INTEL_IOMMU
+extern int iommu_calculate_agaw(struct intel_iommu *iommu);
+extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
+extern int dmar_disabled;
+extern int intel_iommu_enabled;
+#else
+static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+	return 0;
+}
+static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
+{
+	return 0;
+}
+#define dmar_disabled	(1)
+#define intel_iommu_enabled (0)
+#endif
+
+static inline const char *decode_prq_descriptor(char *str, size_t size,
+		u64 dw0, u64 dw1, u64 dw2, u64 dw3)
+{
+	char *buf = str;
+	int bytes;
+
+	bytes = snprintf(buf, size,
+			 "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
+			 FIELD_GET(GENMASK_ULL(31, 16), dw0),
+			 FIELD_GET(GENMASK_ULL(63, 12), dw1),
+			 dw1 & BIT_ULL(0) ? 'r' : '-',
+			 dw1 & BIT_ULL(1) ? 'w' : '-',
+			 dw0 & BIT_ULL(52) ? 'x' : '-',
+			 dw0 & BIT_ULL(53) ? 'p' : '-',
+			 dw1 & BIT_ULL(2) ? 'l' : '-',
+			 FIELD_GET(GENMASK_ULL(51, 32), dw0),
+			 FIELD_GET(GENMASK_ULL(11, 3), dw1));
+
+	/* Private Data */
+	if (dw0 & BIT_ULL(9)) {
+		size -= bytes;
+		buf += bytes;
+		snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
+	}
+
+	return str;
+}
+
+#endif
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index a67319597884..2e9683e970f8 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -10,7 +10,6 @@
 #include <linux/hpet.h>
 #include <linux/pci.h>
 #include <linux/irq.h>
-#include <linux/intel-iommu.h>
 #include <linux/acpi.h>
 #include <linux/irqdomain.h>
 #include <linux/crash_dump.h>
@@ -21,6 +20,7 @@
 #include <asm/irq_remapping.h>
 #include <asm/pci-direct.h>
 
+#include "iommu.h"
 #include "../irq_remapping.h"
 #include "cap_audit.h"
 
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 17cad7c1f62d..43f090381ec7 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -12,13 +12,13 @@
 #include <linux/bitops.h>
 #include <linux/cpufeature.h>
 #include <linux/dmar.h>
-#include <linux/intel-iommu.h>
 #include <linux/iommu.h>
 #include <linux/memory.h>
 #include <linux/pci.h>
 #include <linux/pci-ats.h>
 #include <linux/spinlock.h>
 
+#include "iommu.h"
 #include "pasid.h"
 
 /*
diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c
index 0e8e03252d92..94ee70ac38e3 100644
--- a/drivers/iommu/intel/perf.c
+++ b/drivers/iommu/intel/perf.c
@@ -9,8 +9,8 @@
  */
 
 #include <linux/spinlock.h>
-#include <linux/intel-iommu.h>
 
+#include "iommu.h"
 #include "perf.h"
 
 static DEFINE_SPINLOCK(latency_lock);
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 70b40d007a52..580713aa9e07 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -5,7 +5,6 @@
  * Authors: David Woodhouse <dwmw2@infradead.org>
  */
 
-#include <linux/intel-iommu.h>
 #include <linux/mmu_notifier.h>
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
@@ -22,6 +21,7 @@
 #include <asm/page.h>
 #include <asm/fpu/api.h>
 
+#include "iommu.h"
 #include "pasid.h"
 #include "perf.h"
 #include "../iommu-sva-lib.h"
diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h
index 25cb7f88e1a2..93d96f93a89b 100644
--- a/drivers/iommu/intel/trace.h
+++ b/drivers/iommu/intel/trace.h
@@ -13,7 +13,8 @@
 #define _TRACE_INTEL_IOMMU_H
 
 #include <linux/tracepoint.h>
-#include <linux/intel-iommu.h>
+
+#include "iommu.h"
 
 #define MSG_MAX		256
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
deleted file mode 100644
index 4ebf3c4da45e..000000000000
--- a/include/linux/intel-iommu.h
+++ /dev/null
@@ -1,831 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright © 2006-2015, Intel Corporation.
- *
- * Authors: Ashok Raj <ashok.raj@intel.com>
- *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *          David Woodhouse <David.Woodhouse@intel.com>
- */
-
-#ifndef _INTEL_IOMMU_H_
-#define _INTEL_IOMMU_H_
-
-#include <linux/types.h>
-#include <linux/iova.h>
-#include <linux/io.h>
-#include <linux/idr.h>
-#include <linux/mmu_notifier.h>
-#include <linux/list.h>
-#include <linux/iommu.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
-#include <linux/dmar.h>
-#include <linux/ioasid.h>
-#include <linux/bitfield.h>
-
-#include <asm/cacheflush.h>
-#include <asm/iommu.h>
-
-/*
- * VT-d hardware uses 4KiB page size regardless of host page size.
- */
-#define VTD_PAGE_SHIFT		(12)
-#define VTD_PAGE_SIZE		(1UL << VTD_PAGE_SHIFT)
-#define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
-#define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
-
-#define VTD_STRIDE_SHIFT        (9)
-#define VTD_STRIDE_MASK         (((u64)-1) << VTD_STRIDE_SHIFT)
-
-#define DMA_PTE_READ		BIT_ULL(0)
-#define DMA_PTE_WRITE		BIT_ULL(1)
-#define DMA_PTE_LARGE_PAGE	BIT_ULL(7)
-#define DMA_PTE_SNP		BIT_ULL(11)
-
-#define DMA_FL_PTE_PRESENT	BIT_ULL(0)
-#define DMA_FL_PTE_US		BIT_ULL(2)
-#define DMA_FL_PTE_ACCESS	BIT_ULL(5)
-#define DMA_FL_PTE_DIRTY	BIT_ULL(6)
-#define DMA_FL_PTE_XD		BIT_ULL(63)
-
-#define ADDR_WIDTH_5LEVEL	(57)
-#define ADDR_WIDTH_4LEVEL	(48)
-
-#define CONTEXT_TT_MULTI_LEVEL	0
-#define CONTEXT_TT_DEV_IOTLB	1
-#define CONTEXT_TT_PASS_THROUGH 2
-#define CONTEXT_PASIDE		BIT_ULL(3)
-
-/*
- * Intel IOMMU register specification per version 1.0 public spec.
- */
-#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
-#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
-#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
-#define	DMAR_GCMD_REG	0x18	/* Global command register */
-#define	DMAR_GSTS_REG	0x1c	/* Global status register */
-#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
-#define	DMAR_CCMD_REG	0x28	/* Context command reg */
-#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
-#define	DMAR_FECTL_REG	0x38	/* Fault control register */
-#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
-#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
-#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
-#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
-#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
-#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
-#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
-#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
-#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
-#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
-#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
-#define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
-#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
-#define DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
-#define DMAR_IQER_REG	0xb0	/* Invalidation queue error record register */
-#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
-#define DMAR_PQH_REG	0xc0	/* Page request queue head register */
-#define DMAR_PQT_REG	0xc8	/* Page request queue tail register */
-#define DMAR_PQA_REG	0xd0	/* Page request queue address register */
-#define DMAR_PRS_REG	0xdc	/* Page request status register */
-#define DMAR_PECTL_REG	0xe0	/* Page request event control register */
-#define	DMAR_PEDATA_REG	0xe4	/* Page request event interrupt data register */
-#define	DMAR_PEADDR_REG	0xe8	/* Page request event interrupt addr register */
-#define	DMAR_PEUADDR_REG 0xec	/* Page request event Upper address register */
-#define DMAR_MTRRCAP_REG 0x100	/* MTRR capability register */
-#define DMAR_MTRRDEF_REG 0x108	/* MTRR default type register */
-#define DMAR_MTRR_FIX64K_00000_REG 0x120 /* MTRR Fixed range registers */
-#define DMAR_MTRR_FIX16K_80000_REG 0x128
-#define DMAR_MTRR_FIX16K_A0000_REG 0x130
-#define DMAR_MTRR_FIX4K_C0000_REG 0x138
-#define DMAR_MTRR_FIX4K_C8000_REG 0x140
-#define DMAR_MTRR_FIX4K_D0000_REG 0x148
-#define DMAR_MTRR_FIX4K_D8000_REG 0x150
-#define DMAR_MTRR_FIX4K_E0000_REG 0x158
-#define DMAR_MTRR_FIX4K_E8000_REG 0x160
-#define DMAR_MTRR_FIX4K_F0000_REG 0x168
-#define DMAR_MTRR_FIX4K_F8000_REG 0x170
-#define DMAR_MTRR_PHYSBASE0_REG 0x180 /* MTRR Variable range registers */
-#define DMAR_MTRR_PHYSMASK0_REG 0x188
-#define DMAR_MTRR_PHYSBASE1_REG 0x190
-#define DMAR_MTRR_PHYSMASK1_REG 0x198
-#define DMAR_MTRR_PHYSBASE2_REG 0x1a0
-#define DMAR_MTRR_PHYSMASK2_REG 0x1a8
-#define DMAR_MTRR_PHYSBASE3_REG 0x1b0
-#define DMAR_MTRR_PHYSMASK3_REG 0x1b8
-#define DMAR_MTRR_PHYSBASE4_REG 0x1c0
-#define DMAR_MTRR_PHYSMASK4_REG 0x1c8
-#define DMAR_MTRR_PHYSBASE5_REG 0x1d0
-#define DMAR_MTRR_PHYSMASK5_REG 0x1d8
-#define DMAR_MTRR_PHYSBASE6_REG 0x1e0
-#define DMAR_MTRR_PHYSMASK6_REG 0x1e8
-#define DMAR_MTRR_PHYSBASE7_REG 0x1f0
-#define DMAR_MTRR_PHYSMASK7_REG 0x1f8
-#define DMAR_MTRR_PHYSBASE8_REG 0x200
-#define DMAR_MTRR_PHYSMASK8_REG 0x208
-#define DMAR_MTRR_PHYSBASE9_REG 0x210
-#define DMAR_MTRR_PHYSMASK9_REG 0x218
-#define DMAR_VCCAP_REG		0xe30 /* Virtual command capability register */
-#define DMAR_VCMD_REG		0xe00 /* Virtual command register */
-#define DMAR_VCRSP_REG		0xe10 /* Virtual command response register */
-
-#define DMAR_IQER_REG_IQEI(reg)		FIELD_GET(GENMASK_ULL(3, 0), reg)
-#define DMAR_IQER_REG_ITESID(reg)	FIELD_GET(GENMASK_ULL(47, 32), reg)
-#define DMAR_IQER_REG_ICESID(reg)	FIELD_GET(GENMASK_ULL(63, 48), reg)
-
-#define OFFSET_STRIDE		(9)
-
-#define dmar_readq(a) readq(a)
-#define dmar_writeq(a,v) writeq(v,a)
-#define dmar_readl(a) readl(a)
-#define dmar_writel(a, v) writel(v, a)
-
-#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
-#define DMAR_VER_MINOR(v)		((v) & 0x0f)
-
-/*
- * Decoding Capability Register
- */
-#define cap_5lp_support(c)	(((c) >> 60) & 1)
-#define cap_pi_support(c)	(((c) >> 59) & 1)
-#define cap_fl1gp_support(c)	(((c) >> 56) & 1)
-#define cap_read_drain(c)	(((c) >> 55) & 1)
-#define cap_write_drain(c)	(((c) >> 54) & 1)
-#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
-#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
-#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
-
-#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
-#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
-					* OFFSET_STRIDE) + 21)
-
-#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
-#define cap_max_fault_reg_offset(c) \
-	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
-
-#define cap_zlr(c)		(((c) >> 22) & 1)
-#define cap_isoch(c)		(((c) >> 23) & 1)
-#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
-#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
-#define cap_caching_mode(c)	(((c) >> 7) & 1)
-#define cap_phmr(c)		(((c) >> 6) & 1)
-#define cap_plmr(c)		(((c) >> 5) & 1)
-#define cap_rwbf(c)		(((c) >> 4) & 1)
-#define cap_afl(c)		(((c) >> 3) & 1)
-#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
-/*
- * Extended Capability Register
- */
-
-#define	ecap_rps(e)		(((e) >> 49) & 0x1)
-#define ecap_smpwc(e)		(((e) >> 48) & 0x1)
-#define ecap_flts(e)		(((e) >> 47) & 0x1)
-#define ecap_slts(e)		(((e) >> 46) & 0x1)
-#define ecap_slads(e)		(((e) >> 45) & 0x1)
-#define ecap_vcs(e)		(((e) >> 44) & 0x1)
-#define ecap_smts(e)		(((e) >> 43) & 0x1)
-#define ecap_dit(e)		(((e) >> 41) & 0x1)
-#define ecap_pds(e)		(((e) >> 42) & 0x1)
-#define ecap_pasid(e)		(((e) >> 40) & 0x1)
-#define ecap_pss(e)		(((e) >> 35) & 0x1f)
-#define ecap_eafs(e)		(((e) >> 34) & 0x1)
-#define ecap_nwfs(e)		(((e) >> 33) & 0x1)
-#define ecap_srs(e)		(((e) >> 31) & 0x1)
-#define ecap_ers(e)		(((e) >> 30) & 0x1)
-#define ecap_prs(e)		(((e) >> 29) & 0x1)
-#define ecap_broken_pasid(e)	(((e) >> 28) & 0x1)
-#define ecap_dis(e)		(((e) >> 27) & 0x1)
-#define ecap_nest(e)		(((e) >> 26) & 0x1)
-#define ecap_mts(e)		(((e) >> 25) & 0x1)
-#define ecap_ecs(e)		(((e) >> 24) & 0x1)
-#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
-#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
-#define ecap_coherent(e)	((e) & 0x1)
-#define ecap_qis(e)		((e) & 0x2)
-#define ecap_pass_through(e)	(((e) >> 6) & 0x1)
-#define ecap_eim_support(e)	(((e) >> 4) & 0x1)
-#define ecap_ir_support(e)	(((e) >> 3) & 0x1)
-#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
-#define ecap_max_handle_mask(e) (((e) >> 20) & 0xf)
-#define ecap_sc_support(e)	(((e) >> 7) & 0x1) /* Snooping Control */
-
-/* Virtual command interface capability */
-#define vccap_pasid(v)		(((v) & DMA_VCS_PAS)) /* PASID allocation */
-
-/* IOTLB_REG */
-#define DMA_TLB_FLUSH_GRANU_OFFSET  60
-#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
-#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
-#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
-#define DMA_TLB_IIRG(type) ((type >> 60) & 3)
-#define DMA_TLB_IAIG(val) (((val) >> 57) & 3)
-#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
-#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
-#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
-#define DMA_TLB_IVT (((u64)1) << 63)
-#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
-#define DMA_TLB_MAX_SIZE (0x3f)
-
-/* INVALID_DESC */
-#define DMA_CCMD_INVL_GRANU_OFFSET  61
-#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 4)
-#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 4)
-#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 4)
-#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
-#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
-#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
-#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
-#define DMA_ID_TLB_ADDR(addr)	(addr)
-#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
-
-/* PMEN_REG */
-#define DMA_PMEN_EPM (((u32)1)<<31)
-#define DMA_PMEN_PRS (((u32)1)<<0)
-
-/* GCMD_REG */
-#define DMA_GCMD_TE (((u32)1) << 31)
-#define DMA_GCMD_SRTP (((u32)1) << 30)
-#define DMA_GCMD_SFL (((u32)1) << 29)
-#define DMA_GCMD_EAFL (((u32)1) << 28)
-#define DMA_GCMD_WBF (((u32)1) << 27)
-#define DMA_GCMD_QIE (((u32)1) << 26)
-#define DMA_GCMD_SIRTP (((u32)1) << 24)
-#define DMA_GCMD_IRE (((u32) 1) << 25)
-#define DMA_GCMD_CFI (((u32) 1) << 23)
-
-/* GSTS_REG */
-#define DMA_GSTS_TES (((u32)1) << 31)
-#define DMA_GSTS_RTPS (((u32)1) << 30)
-#define DMA_GSTS_FLS (((u32)1) << 29)
-#define DMA_GSTS_AFLS (((u32)1) << 28)
-#define DMA_GSTS_WBFS (((u32)1) << 27)
-#define DMA_GSTS_QIES (((u32)1) << 26)
-#define DMA_GSTS_IRTPS (((u32)1) << 24)
-#define DMA_GSTS_IRES (((u32)1) << 25)
-#define DMA_GSTS_CFIS (((u32)1) << 23)
-
-/* DMA_RTADDR_REG */
-#define DMA_RTADDR_RTT (((u64)1) << 11)
-#define DMA_RTADDR_SMT (((u64)1) << 10)
-
-/* CCMD_REG */
-#define DMA_CCMD_ICC (((u64)1) << 63)
-#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
-#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
-#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
-#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
-#define DMA_CCMD_MASK_NOBIT 0
-#define DMA_CCMD_MASK_1BIT 1
-#define DMA_CCMD_MASK_2BIT 2
-#define DMA_CCMD_MASK_3BIT 3
-#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
-#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
-
-/* FECTL_REG */
-#define DMA_FECTL_IM (((u32)1) << 31)
-
-/* FSTS_REG */
-#define DMA_FSTS_PFO (1 << 0) /* Primary Fault Overflow */
-#define DMA_FSTS_PPF (1 << 1) /* Primary Pending Fault */
-#define DMA_FSTS_IQE (1 << 4) /* Invalidation Queue Error */
-#define DMA_FSTS_ICE (1 << 5) /* Invalidation Completion Error */
-#define DMA_FSTS_ITE (1 << 6) /* Invalidation Time-out Error */
-#define DMA_FSTS_PRO (1 << 7) /* Page Request Overflow */
-#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
-
-/* FRCD_REG, 32 bits access */
-#define DMA_FRCD_F (((u32)1) << 31)
-#define dma_frcd_type(d) ((d >> 30) & 1)
-#define dma_frcd_fault_reason(c) (c & 0xff)
-#define dma_frcd_source_id(c) (c & 0xffff)
-#define dma_frcd_pasid_value(c) (((c) >> 8) & 0xfffff)
-#define dma_frcd_pasid_present(c) (((c) >> 31) & 1)
-/* low 64 bit */
-#define dma_frcd_page_addr(d) (d & (((u64)-1) << PAGE_SHIFT))
-
-/* PRS_REG */
-#define DMA_PRS_PPR	((u32)1)
-#define DMA_PRS_PRO	((u32)2)
-
-#define DMA_VCS_PAS	((u64)1)
-
-#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts)			\
-do {									\
-	cycles_t start_time = get_cycles();				\
-	while (1) {							\
-		sts = op(iommu->reg + offset);				\
-		if (cond)						\
-			break;						\
-		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
-			panic("DMAR hardware is malfunctioning\n");	\
-		cpu_relax();						\
-	}								\
-} while (0)
-
-#define QI_LENGTH	256	/* queue length */
-
-enum {
-	QI_FREE,
-	QI_IN_USE,
-	QI_DONE,
-	QI_ABORT
-};
-
-#define QI_CC_TYPE		0x1
-#define QI_IOTLB_TYPE		0x2
-#define QI_DIOTLB_TYPE		0x3
-#define QI_IEC_TYPE		0x4
-#define QI_IWD_TYPE		0x5
-#define QI_EIOTLB_TYPE		0x6
-#define QI_PC_TYPE		0x7
-#define QI_DEIOTLB_TYPE		0x8
-#define QI_PGRP_RESP_TYPE	0x9
-#define QI_PSTRM_RESP_TYPE	0xa
-
-#define QI_IEC_SELECTIVE	(((u64)1) << 4)
-#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
-#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
-
-#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
-#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
-#define QI_IWD_FENCE		(((u64)1) << 6)
-#define QI_IWD_PRQ_DRAIN	(((u64)1) << 7)
-
-#define QI_IOTLB_DID(did) 	(((u64)did) << 16)
-#define QI_IOTLB_DR(dr) 	(((u64)dr) << 7)
-#define QI_IOTLB_DW(dw) 	(((u64)dw) << 6)
-#define QI_IOTLB_GRAN(gran) 	(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
-#define QI_IOTLB_ADDR(addr)	(((u64)addr) & VTD_PAGE_MASK)
-#define QI_IOTLB_IH(ih)		(((u64)ih) << 6)
-#define QI_IOTLB_AM(am)		(((u8)am) & 0x3f)
-
-#define QI_CC_FM(fm)		(((u64)fm) << 48)
-#define QI_CC_SID(sid)		(((u64)sid) << 32)
-#define QI_CC_DID(did)		(((u64)did) << 16)
-#define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
-
-#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
-#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
-#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
-#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
-				   ((u64)((pfsid >> 4) & 0xfff) << 52))
-#define QI_DEV_IOTLB_SIZE	1
-#define QI_DEV_IOTLB_MAX_INVS	32
-
-#define QI_PC_PASID(pasid)	(((u64)pasid) << 32)
-#define QI_PC_DID(did)		(((u64)did) << 16)
-#define QI_PC_GRAN(gran)	(((u64)gran) << 4)
-
-/* PASID cache invalidation granu */
-#define QI_PC_ALL_PASIDS	0
-#define QI_PC_PASID_SEL		1
-#define QI_PC_GLOBAL		3
-
-#define QI_EIOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
-#define QI_EIOTLB_IH(ih)	(((u64)ih) << 6)
-#define QI_EIOTLB_AM(am)	(((u64)am) & 0x3f)
-#define QI_EIOTLB_PASID(pasid) 	(((u64)pasid) << 32)
-#define QI_EIOTLB_DID(did)	(((u64)did) << 16)
-#define QI_EIOTLB_GRAN(gran) 	(((u64)gran) << 4)
-
-/* QI Dev-IOTLB inv granu */
-#define QI_DEV_IOTLB_GRAN_ALL		1
-#define QI_DEV_IOTLB_GRAN_PASID_SEL	0
-
-#define QI_DEV_EIOTLB_ADDR(a)	((u64)(a) & VTD_PAGE_MASK)
-#define QI_DEV_EIOTLB_SIZE	(((u64)1) << 11)
-#define QI_DEV_EIOTLB_PASID(p)	((u64)((p) & 0xfffff) << 32)
-#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
-#define QI_DEV_EIOTLB_QDEP(qd)	((u64)((qd) & 0x1f) << 4)
-#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \
-				    ((u64)((pfsid >> 4) & 0xfff) << 52))
-#define QI_DEV_EIOTLB_MAX_INVS	32
-
-/* Page group response descriptor QW0 */
-#define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
-#define QI_PGRP_PDP(p)		(((u64)(p)) << 5)
-#define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
-#define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
-#define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
-
-/* Page group response descriptor QW1 */
-#define QI_PGRP_LPIG(x)		(((u64)(x)) << 2)
-#define QI_PGRP_IDX(idx)	(((u64)(idx)) << 3)
-
-
-#define QI_RESP_SUCCESS		0x0
-#define QI_RESP_INVALID		0x1
-#define QI_RESP_FAILURE		0xf
-
-#define QI_GRAN_NONG_PASID		2
-#define QI_GRAN_PSI_PASID		3
-
-#define qi_shift(iommu)		(DMAR_IQ_SHIFT + !!ecap_smts((iommu)->ecap))
-
-struct qi_desc {
-	u64 qw0;
-	u64 qw1;
-	u64 qw2;
-	u64 qw3;
-};
-
-struct q_inval {
-	raw_spinlock_t  q_lock;
-	void		*desc;          /* invalidation queue */
-	int             *desc_status;   /* desc status */
-	int             free_head;      /* first free entry */
-	int             free_tail;      /* last free entry */
-	int             free_cnt;
-};
-
-struct dmar_pci_notify_info;
-
-#ifdef CONFIG_IRQ_REMAP
-/* 1MB - maximum possible interrupt remapping table size */
-#define INTR_REMAP_PAGE_ORDER	8
-#define INTR_REMAP_TABLE_REG_SIZE	0xf
-#define INTR_REMAP_TABLE_REG_SIZE_MASK  0xf
-
-#define INTR_REMAP_TABLE_ENTRIES	65536
-
-struct irq_domain;
-
-struct ir_table {
-	struct irte *base;
-	unsigned long *bitmap;
-};
-
-void intel_irq_remap_add_device(struct dmar_pci_notify_info *info);
-#else
-static inline void
-intel_irq_remap_add_device(struct dmar_pci_notify_info *info) { }
-#endif
-
-struct iommu_flush {
-	void (*flush_context)(struct intel_iommu *iommu, u16 did, u16 sid,
-			      u8 fm, u64 type);
-	void (*flush_iotlb)(struct intel_iommu *iommu, u16 did, u64 addr,
-			    unsigned int size_order, u64 type);
-};
-
-enum {
-	SR_DMAR_FECTL_REG,
-	SR_DMAR_FEDATA_REG,
-	SR_DMAR_FEADDR_REG,
-	SR_DMAR_FEUADDR_REG,
-	MAX_SR_DMAR_REGS
-};
-
-#define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
-#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
-#define VTD_FLAG_SVM_CAPABLE		(1 << 2)
-
-extern int intel_iommu_sm;
-extern spinlock_t device_domain_lock;
-
-#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
-#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
-				 ecap_pasid((iommu)->ecap))
-
-struct pasid_entry;
-struct pasid_state_entry;
-struct page_req_dsc;
-
-/*
- * 0: Present
- * 1-11: Reserved
- * 12-63: Context Ptr (12 - (haw-1))
- * 64-127: Reserved
- */
-struct root_entry {
-	u64     lo;
-	u64     hi;
-};
-
-/*
- * low 64 bits:
- * 0: present
- * 1: fault processing disable
- * 2-3: translation type
- * 12-63: address space root
- * high 64 bits:
- * 0-2: address width
- * 3-6: aval
- * 8-23: domain id
- */
-struct context_entry {
-	u64 lo;
-	u64 hi;
-};
-
-/*
- * When VT-d works in the scalable mode, it allows DMA translation to
- * happen through either first level or second level page table. This
- * bit marks that the DMA translation for the domain goes through the
- * first level page table, otherwise, it goes through the second level.
- */
-#define DOMAIN_FLAG_USE_FIRST_LEVEL		BIT(1)
-
-struct dmar_domain {
-	int	nid;			/* node id */
-
-	unsigned int iommu_refcnt[DMAR_UNITS_SUPPORTED];
-					/* Refcount of devices per iommu */
-
-
-	u16		iommu_did[DMAR_UNITS_SUPPORTED];
-					/* Domain ids per IOMMU. Use u16 since
-					 * domain ids are 16 bit wide according
-					 * to VT-d spec, section 9.3 */
-
-	u8 has_iotlb_device: 1;
-	u8 iommu_coherency: 1;		/* indicate coherency of iommu access */
-	u8 force_snooping : 1;		/* Create IOPTEs with snoop control */
-	u8 set_pte_snp:1;
-
-	struct list_head devices;	/* all devices' list */
-	struct iova_domain iovad;	/* iova's that belong to this domain */
-
-	struct dma_pte	*pgd;		/* virtual address */
-	int		gaw;		/* max guest address width */
-
-	/* adjusted guest address width, 0 is level 2 30-bit */
-	int		agaw;
-
-	int		flags;		/* flags to find out type of domain */
-	int		iommu_superpage;/* Level of superpages supported:
-					   0 == 4KiB (no superpages), 1 == 2MiB,
-					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-	u64		max_addr;	/* maximum mapped address */
-
-	struct iommu_domain domain;	/* generic domain data structure for
-					   iommu core */
-};
-
-struct intel_iommu {
-	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
-	u64 		reg_phys; /* physical address of hw register set */
-	u64		reg_size; /* size of hw register set */
-	u64		cap;
-	u64		ecap;
-	u64		vccap;
-	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
-	raw_spinlock_t	register_lock; /* protect register handling */
-	int		seq_id;	/* sequence id of the iommu */
-	int		agaw; /* agaw of this iommu */
-	int		msagaw; /* max sagaw of this iommu */
-	unsigned int 	irq, pr_irq;
-	u16		segment;     /* PCI segment# */
-	unsigned char 	name[13];    /* Device Name */
-
-#ifdef CONFIG_INTEL_IOMMU
-	unsigned long 	*domain_ids; /* bitmap of domains */
-	spinlock_t	lock; /* protect context, domain ids */
-	struct root_entry *root_entry; /* virtual address */
-
-	struct iommu_flush flush;
-#endif
-#ifdef CONFIG_INTEL_IOMMU_SVM
-	struct page_req_dsc *prq;
-	unsigned char prq_name[16];    /* Name for PRQ interrupt */
-	struct completion prq_complete;
-	struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */
-#endif
-	struct iopf_queue *iopf_queue;
-	unsigned char iopfq_name[16];
-	struct q_inval  *qi;            /* Queued invalidation info */
-	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
-
-#ifdef CONFIG_IRQ_REMAP
-	struct ir_table *ir_table;	/* Interrupt remapping info */
-	struct irq_domain *ir_domain;
-	struct irq_domain *ir_msi_domain;
-#endif
-	struct iommu_device iommu;  /* IOMMU core code handle */
-	int		node;
-	u32		flags;      /* Software defined flags */
-
-	struct dmar_drhd_unit *drhd;
-	void *perf_statistic;
-};
-
-/* PCI domain-device relationship */
-struct device_domain_info {
-	struct list_head link;	/* link to domain siblings */
-	struct list_head global; /* link to global list */
-	u32 segment;		/* PCI segment number */
-	u8 bus;			/* PCI bus number */
-	u8 devfn;		/* PCI devfn number */
-	u16 pfsid;		/* SRIOV physical function source ID */
-	u8 pasid_supported:3;
-	u8 pasid_enabled:1;
-	u8 pri_supported:1;
-	u8 pri_enabled:1;
-	u8 ats_supported:1;
-	u8 ats_enabled:1;
-	u8 ats_qdep;
-	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
-	struct intel_iommu *iommu; /* IOMMU used by this device */
-	struct dmar_domain *domain; /* pointer to domain */
-	struct pasid_table *pasid_table; /* pasid table */
-};
-
-static inline void __iommu_flush_cache(
-	struct intel_iommu *iommu, void *addr, int size)
-{
-	if (!ecap_coherent(iommu->ecap))
-		clflush_cache_range(addr, size);
-}
-
-/* Convert generic struct iommu_domain to private struct dmar_domain */
-static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
-{
-	return container_of(dom, struct dmar_domain, domain);
-}
-
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-10: available
- * 11: snoop behavior
- * 12-63: Host physical address
- */
-struct dma_pte {
-	u64 val;
-};
-
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-	pte->val = 0;
-}
-
-static inline u64 dma_pte_addr(struct dma_pte *pte)
-{
-#ifdef CONFIG_64BIT
-	return pte->val & VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
-#else
-	/* Must have a full atomic 64-bit read */
-	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) &
-			VTD_PAGE_MASK & (~DMA_FL_PTE_XD);
-#endif
-}
-
-static inline bool dma_pte_present(struct dma_pte *pte)
-{
-	return (pte->val & 3) != 0;
-}
-
-static inline bool dma_pte_superpage(struct dma_pte *pte)
-{
-	return (pte->val & DMA_PTE_LARGE_PAGE);
-}
-
-static inline bool first_pte_in_page(struct dma_pte *pte)
-{
-	return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE);
-}
-
-static inline int nr_pte_to_next_page(struct dma_pte *pte)
-{
-	return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) :
-		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
-}
-
-extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
-
-extern int dmar_enable_qi(struct intel_iommu *iommu);
-extern void dmar_disable_qi(struct intel_iommu *iommu);
-extern int dmar_reenable_qi(struct intel_iommu *iommu);
-extern void qi_global_iec(struct intel_iommu *iommu);
-
-extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
-			     u8 fm, u64 type);
-extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
-			  unsigned int size_order, u64 type);
-extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
-			u16 qdep, u64 addr, unsigned mask);
-
-void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
-		     unsigned long npages, bool ih);
-
-void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
-			      u32 pasid, u16 qdep, u64 addr,
-			      unsigned int size_order);
-void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu,
-			  u32 pasid);
-
-int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
-		   unsigned int count, unsigned long options);
-/*
- * Options used in qi_submit_sync:
- * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8.
- */
-#define QI_OPT_WAIT_DRAIN		BIT(0)
-
-extern int dmar_ir_support(void);
-
-void *alloc_pgtable_page(int node);
-void free_pgtable_page(void *vaddr);
-struct intel_iommu *domain_get_iommu(struct dmar_domain *domain);
-void iommu_flush_write_buffer(struct intel_iommu *iommu);
-int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
-struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
-
-#ifdef CONFIG_INTEL_IOMMU_SVM
-extern void intel_svm_check(struct intel_iommu *iommu);
-extern int intel_svm_enable_prq(struct intel_iommu *iommu);
-extern int intel_svm_finish_prq(struct intel_iommu *iommu);
-struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm,
-				 void *drvdata);
-void intel_svm_unbind(struct iommu_sva *handle);
-u32 intel_svm_get_pasid(struct iommu_sva *handle);
-int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt,
-			    struct iommu_page_response *msg);
-
-struct intel_svm_dev {
-	struct list_head list;
-	struct rcu_head rcu;
-	struct device *dev;
-	struct intel_iommu *iommu;
-	struct iommu_sva sva;
-	unsigned long prq_seq_number;
-	u32 pasid;
-	int users;
-	u16 did;
-	u16 dev_iotlb:1;
-	u16 sid, qdep;
-};
-
-struct intel_svm {
-	struct mmu_notifier notifier;
-	struct mm_struct *mm;
-
-	unsigned int flags;
-	u32 pasid;
-	struct list_head devs;
-};
-#else
-static inline void intel_svm_check(struct intel_iommu *iommu) {}
-#endif
-
-#ifdef CONFIG_INTEL_IOMMU_DEBUGFS
-void intel_iommu_debugfs_init(void);
-#else
-static inline void intel_iommu_debugfs_init(void) {}
-#endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
-
-extern const struct attribute_group *intel_iommu_groups[];
-bool context_present(struct context_entry *context);
-struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
-					 u8 devfn, int alloc);
-
-extern const struct iommu_ops intel_iommu_ops;
-
-#ifdef CONFIG_INTEL_IOMMU
-extern int iommu_calculate_agaw(struct intel_iommu *iommu);
-extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
-extern int dmar_disabled;
-extern int intel_iommu_enabled;
-#else
-static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
-{
-	return 0;
-}
-static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
-{
-	return 0;
-}
-#define dmar_disabled	(1)
-#define intel_iommu_enabled (0)
-#endif
-
-static inline const char *decode_prq_descriptor(char *str, size_t size,
-		u64 dw0, u64 dw1, u64 dw2, u64 dw3)
-{
-	char *buf = str;
-	int bytes;
-
-	bytes = snprintf(buf, size,
-			 "rid=0x%llx addr=0x%llx %c%c%c%c%c pasid=0x%llx index=0x%llx",
-			 FIELD_GET(GENMASK_ULL(31, 16), dw0),
-			 FIELD_GET(GENMASK_ULL(63, 12), dw1),
-			 dw1 & BIT_ULL(0) ? 'r' : '-',
-			 dw1 & BIT_ULL(1) ? 'w' : '-',
-			 dw0 & BIT_ULL(52) ? 'x' : '-',
-			 dw0 & BIT_ULL(53) ? 'p' : '-',
-			 dw1 & BIT_ULL(2) ? 'l' : '-',
-			 FIELD_GET(GENMASK_ULL(51, 32), dw0),
-			 FIELD_GET(GENMASK_ULL(11, 3), dw1));
-
-	/* Private Data */
-	if (dw0 & BIT_ULL(9)) {
-		size -= bytes;
-		buf += bytes;
-		snprintf(buf, size, " private=0x%llx/0x%llx\n", dw2, dw3);
-	}
-
-	return str;
-}
-
-#endif
-- 
cgit v1.2.3


From 25357900f4e67094dd09b7dcb8a5f47c3f5a159d Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 12 Jul 2022 08:09:08 +0800
Subject: iommu/vt-d: Make DMAR_UNITS_SUPPORTED default 1024

If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set
to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to
allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ
remapping doesn't support X2APIC mode x2apic disabled"; and the system
fails to boot properly.

To support up to 64 sockets with 10 DMAR units each (640), make the
value of DMAR_UNITS_SUPPORTED default 1024.

Signed-off-by: Steve Wahl<steve.wahl@hpe.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Steve Wahl <steve.wahl@hpe.com>
Link: https://lore.kernel.org/linux-iommu/20220615183650.32075-1-steve.wahl@hpe.com/
Link: https://lore.kernel.org/r/20220702015610.2849494-7-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/dmar.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index cbd714a198a0..d81a51978d01 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -18,11 +18,7 @@
 
 struct acpi_dmar_header;
 
-#ifdef	CONFIG_X86
-# define	DMAR_UNITS_SUPPORTED	MAX_IO_APICS
-#else
-# define	DMAR_UNITS_SUPPORTED	64
-#endif
+#define DMAR_UNITS_SUPPORTED	1024
 
 /* DMAR Flags */
 #define DMAR_INTR_REMAP		0x1
-- 
cgit v1.2.3


From fb2accadaa9427a374fa9f482ea24ca731f675ba Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Wed, 13 Jul 2022 17:56:48 -0500
Subject: iommu/amd: Introduce function to check and enable SNP

To support SNP, IOMMU needs to be enabled, and prohibits IOMMU
configurations where DTE[Mode]=0, which means it cannot be supported with
IOMMU passthrough domain (a.k.a IOMMU_DOMAIN_IDENTITY),
and when AMD IOMMU driver is configured to not use the IOMMU host (v1) page
table. Otherwise, RMP table initialization could cause the system to crash.

The request to enable SNP support in IOMMU must be done before PCI
initialization state of the IOMMU driver because enabling SNP affects
how IOMMU driver sets up IOMMU data structures (i.e. DTE).

Unlike other IOMMU features, SNP feature does not have an enable bit in
the IOMMU control register. Instead, the IOMMU driver introduces
an amd_iommu_snp_en variable to track enabling state of SNP.

Introduce amd_iommu_snp_enable() for other drivers to request enabling
the SNP support in IOMMU, which checks all prerequisites and determines
if the feature can be safely enabled.

Please see the IOMMU spec section 2.12 for further details.

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Co-developed-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Link: https://lore.kernel.org/r/20220713225651.20758-7-suravee.suthikulpanit@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/amd_iommu.h |  2 ++
 drivers/iommu/amd/init.c      | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/amd-iommu.h     |  4 ++++
 3 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 1b945d47741c..84e5bb1bf01b 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -140,4 +140,6 @@ extern struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
 
 extern u64 amd_iommu_efr;
 extern u64 amd_iommu_efr2;
+
+extern bool amd_iommu_snp_en;
 #endif
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 7c6e61451ff7..cddcb6f7e3c9 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -168,6 +168,10 @@ static int amd_iommu_target_ivhd_type;
 u64 amd_iommu_efr;
 u64 amd_iommu_efr2;
 
+/* SNP is enabled on the system? */
+bool amd_iommu_snp_en;
+EXPORT_SYMBOL(amd_iommu_snp_en);
+
 LIST_HEAD(amd_iommu_pci_seg_list);	/* list of all PCI segments */
 LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
 					   system */
@@ -3556,3 +3560,41 @@ int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64
 
 	return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
 }
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+int amd_iommu_snp_enable(void)
+{
+	/*
+	 * The SNP support requires that IOMMU must be enabled, and is
+	 * not configured in the passthrough mode.
+	 */
+	if (no_iommu || iommu_default_passthrough()) {
+		pr_err("SNP: IOMMU is disabled or configured in passthrough mode, SNP cannot be supported");
+		return -EINVAL;
+	}
+
+	/*
+	 * Prevent enabling SNP after IOMMU_ENABLED state because this process
+	 * affect how IOMMU driver sets up data structures and configures
+	 * IOMMU hardware.
+	 */
+	if (init_state > IOMMU_ENABLED) {
+		pr_err("SNP: Too late to enable SNP for IOMMU.\n");
+		return -EINVAL;
+	}
+
+	amd_iommu_snp_en = check_feature_on_all_iommus(FEATURE_SNP);
+	if (!amd_iommu_snp_en)
+		return -EINVAL;
+
+	pr_info("SNP enabled\n");
+
+	/* Enforce IOMMU v1 pagetable when SNP is enabled. */
+	if (amd_iommu_pgtable != AMD_IOMMU_V1) {
+		pr_warn("Force to using AMD IOMMU v1 page table due to SNP\n");
+		amd_iommu_pgtable = AMD_IOMMU_V1;
+	}
+
+	return 0;
+}
+#endif
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 58e6c3806c09..953e6f12fa1c 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -206,4 +206,8 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
 		u64 *value);
 struct amd_iommu *get_amd_iommu(unsigned int idx);
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+int amd_iommu_snp_enable(void);
+#endif
+
 #endif /* _ASM_X86_AMD_IOMMU_H */
-- 
cgit v1.2.3


From 93064e15c8a3a8394319a11b8037666e4b7d653d Mon Sep 17 00:00:00 2001
From: Stefan Binding <sbinding@opensource.cirrus.com>
Date: Thu, 7 Jul 2022 16:10:36 +0100
Subject: ACPI: utils: Add api to read _SUB from ACPI

Add a wrapper function to read the _SUB string from ACPI.

Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220707151037.3901050-2-sbinding@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/acpi/utils.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h |  6 ++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 3a9773a09e19..5a7b8065e77f 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -291,6 +291,44 @@ int acpi_get_local_address(acpi_handle handle, u32 *addr)
 }
 EXPORT_SYMBOL(acpi_get_local_address);
 
+#define ACPI_MAX_SUB_BUF_SIZE	9
+
+const char *acpi_get_subsystem_id(acpi_handle handle)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	acpi_status status;
+	const char *sub;
+	size_t len;
+
+	status = acpi_evaluate_object(handle, METHOD_NAME__SUB, NULL, &buffer);
+	if (ACPI_FAILURE(status)) {
+		acpi_handle_debug(handle, "Reading ACPI _SUB failed: %#x\n", status);
+		return ERR_PTR(-ENODATA);
+	}
+
+	obj = buffer.pointer;
+	if (obj->type == ACPI_TYPE_STRING) {
+		len = strlen(obj->string.pointer);
+		if (len < ACPI_MAX_SUB_BUF_SIZE && len > 0) {
+			sub = kstrdup(obj->string.pointer, GFP_KERNEL);
+			if (!sub)
+				sub = ERR_PTR(-ENOMEM);
+		} else {
+			acpi_handle_err(handle, "ACPI _SUB Length %zu is Invalid\n", len);
+			sub = ERR_PTR(-ENODATA);
+		}
+	} else {
+		acpi_handle_warn(handle, "Warning ACPI _SUB did not return a string\n");
+		sub = ERR_PTR(-ENODATA);
+	}
+
+	acpi_os_free(buffer.pointer);
+
+	return sub;
+}
+EXPORT_SYMBOL_GPL(acpi_get_subsystem_id);
+
 acpi_status
 acpi_evaluate_reference(acpi_handle handle,
 			acpi_string pathname,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4f82a5bc6d98..968a187f14f0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -762,6 +762,7 @@ static inline u64 acpi_arch_get_root_pointer(void)
 #endif
 
 int acpi_get_local_address(acpi_handle handle, u32 *addr);
+const char *acpi_get_subsystem_id(acpi_handle handle);
 
 #else	/* !CONFIG_ACPI */
 
@@ -1023,6 +1024,11 @@ static inline int acpi_get_local_address(acpi_handle handle, u32 *addr)
 	return -ENODEV;
 }
 
+static inline const char *acpi_get_subsystem_id(acpi_handle handle)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline int acpi_register_wakeup_handler(int wake_irq,
 	bool (*wakeup)(void *context), void *context)
 {
-- 
cgit v1.2.3


From 4dea97f8636d0514befc9fc5cf342b351b7d0e20 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:25 -0700
Subject: lib/bitmap: change type of bitmap_weight to unsigned long

bitmap_weight() doesn't return negative values, so change it's type
to unsigned long. It may help compiler to generate better code and
catch bugs.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 drivers/dma/ti/k3-udma.c                           | 6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c            | 2 +-
 drivers/gpu/drm/i915/display/intel_display_power.c | 2 +-
 drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c           | 2 +-
 drivers/iommu/intel/iommu.c                        | 2 +-
 drivers/net/ethernet/mellanox/mlx4/fw.c            | 2 +-
 drivers/net/wireless/ath/ath9k/htc_drv_debug.c     | 2 +-
 drivers/net/wireless/ath/carl9170/debug.c          | 2 +-
 include/linux/bitmap.h                             | 5 +++--
 lib/bitmap.c                                       | 5 ++---
 tools/include/linux/bitmap.h                       | 4 ++--
 tools/lib/bitmap.c                                 | 4 ++--
 12 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index 2f0d2c68c93c..07cb48db76ba 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -4997,7 +4997,7 @@ static int setup_resources(struct udma_dev *ud)
 	switch (ud->match_data->type) {
 	case DMA_TYPE_UDMA:
 		dev_info(dev,
-			 "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n",
+			 "Channels: %d (tchan: %lu, rchan: %lu, gp-rflow: %lu)\n",
 			 ch_count,
 			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
 						       ud->tchan_cnt),
@@ -5008,7 +5008,7 @@ static int setup_resources(struct udma_dev *ud)
 		break;
 	case DMA_TYPE_BCDMA:
 		dev_info(dev,
-			 "Channels: %d (bchan: %u, tchan: %u, rchan: %u)\n",
+			 "Channels: %d (bchan: %lu, tchan: %lu, rchan: %lu)\n",
 			 ch_count,
 			 ud->bchan_cnt - bitmap_weight(ud->bchan_map,
 						       ud->bchan_cnt),
@@ -5019,7 +5019,7 @@ static int setup_resources(struct udma_dev *ud)
 		break;
 	case DMA_TYPE_PKTDMA:
 		dev_info(dev,
-			 "Channels: %d (tchan: %u, rchan: %u)\n",
+			 "Channels: %d (tchan: %lu, rchan: %lu)\n",
 			 ch_count,
 			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
 						       ud->tchan_cnt),
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 16699158e00d..1098506ba1aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -195,7 +195,7 @@ void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)
 			set_bit(i, adev->gfx.mec.queue_bitmap);
 	}
 
-	dev_dbg(adev->dev, "mec queue bitmap weight=%d\n", bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES));
+	dev_dbg(adev->dev, "mec queue bitmap weight=%ld\n", bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES));
 }
 
 void amdgpu_gfx_graphics_queue_acquire(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c
index 949edc983a16..4204e8a0e1a6 100644
--- a/drivers/gpu/drm/i915/display/intel_display_power.c
+++ b/drivers/gpu/drm/i915/display/intel_display_power.c
@@ -378,7 +378,7 @@ static void print_power_domains(struct i915_power_domains *power_domains,
 						     power_domains);
 	enum intel_display_power_domain domain;
 
-	drm_dbg(&i915->drm, "%s (%d):\n", prefix, bitmap_weight(mask->bits, POWER_DOMAIN_NUM));
+	drm_dbg(&i915->drm, "%s (%ld):\n", prefix, bitmap_weight(mask->bits, POWER_DOMAIN_NUM));
 	for_each_power_domain(domain, mask)
 		drm_dbg(&i915->drm, "%s use_count %d\n",
 			intel_display_power_domain_str(domain),
diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
index 56a3063545ec..3d35b3982a60 100644
--- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
+++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_smp.c
@@ -363,7 +363,7 @@ void mdp5_smp_dump(struct mdp5_smp *smp, struct drm_printer *p)
 	}
 
 	drm_printf(p, "TOTAL:\t%d\t(of %d)\n", total, smp->blk_cnt);
-	drm_printf(p, "AVAIL:\t%d\n", smp->blk_cnt -
+	drm_printf(p, "AVAIL:\t%ld\n", smp->blk_cnt -
 			bitmap_weight(state->state, smp->blk_cnt));
 
 	if (drm_can_sleep())
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 44016594831d..2f1b09f34706 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3946,7 +3946,7 @@ static ssize_t domains_used_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
-	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+	return sprintf(buf, "%ld\n", bitmap_weight(iommu->domain_ids,
 						  cap_ndoms(iommu->cap)));
 }
 static DEVICE_ATTR_RO(domains_used);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 42c96c9d7fb1..af054a3808ca 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -463,7 +463,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 
 		field = min(
 			bitmap_weight(actv_ports.ports, dev->caps.num_ports),
-			dev->caps.num_ports);
+			(unsigned long)dev->caps.num_ports);
 		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
 
 		size = dev->caps.function_caps; /* set PF behaviours */
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
index b3ed65e5c4da..d32056c4ee1d 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_debug.c
@@ -296,7 +296,7 @@ static ssize_t read_file_slot(struct file *file, char __user *user_buf,
 	spin_lock_bh(&priv->tx.tx_lock);
 	len = scnprintf(buf, sizeof(buf),
 			"TX slot bitmap : %*pb\n"
-			"Used slots     : %d\n",
+			"Used slots     : %ld\n",
 			MAX_TX_BUF_NUM, priv->tx.tx_slot,
 			bitmap_weight(priv->tx.tx_slot, MAX_TX_BUF_NUM));
 	spin_unlock_bh(&priv->tx.tx_lock);
diff --git a/drivers/net/wireless/ath/carl9170/debug.c b/drivers/net/wireless/ath/carl9170/debug.c
index bb40889d7c72..7e88882b87d9 100644
--- a/drivers/net/wireless/ath/carl9170/debug.c
+++ b/drivers/net/wireless/ath/carl9170/debug.c
@@ -221,7 +221,7 @@ static char *carl9170_debugfs_mem_usage_read(struct ar9170 *ar, char *buf,
 	ADD(buf, *len, bufsize, "jar: [%*pb]\n",
 	    ar->fw.mem_blocks, ar->mem_bitmap);
 
-	ADD(buf, *len, bufsize, "cookies: used:%3d / total:%3d, allocs:%d\n",
+	ADD(buf, *len, bufsize, "cookies: used:%3ld / total:%3d, allocs:%d\n",
 	    bitmap_weight(ar->mem_bitmap, ar->fw.mem_blocks),
 	    ar->fw.mem_blocks, atomic_read(&ar->mem_allocs));
 
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index e1a438bdda52..035d4ac66641 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -163,7 +163,7 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
 			 const unsigned long *bitmap2, unsigned int nbits);
 bool __bitmap_subset(const unsigned long *bitmap1,
 		     const unsigned long *bitmap2, unsigned int nbits);
-int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
 void __bitmap_set(unsigned long *map, unsigned int start, int len);
 void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 
@@ -431,7 +431,8 @@ static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
 	return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static __always_inline
+unsigned long bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 9bc80f5bf149..2b67cd657692 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -333,10 +333,9 @@ bool __bitmap_subset(const unsigned long *bitmap1,
 }
 EXPORT_SYMBOL(__bitmap_subset);
 
-int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 {
-	unsigned int k, lim = bits/BITS_PER_LONG;
-	int w = 0;
+	unsigned long k, w = 0, lim = bits/BITS_PER_LONG;
 
 	for (k = 0; k < lim; k++)
 		w += hweight_long(bitmap[k]);
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 2ae7ab8ed7d1..ae1852e39142 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -11,7 +11,7 @@
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
 
-int __bitmap_weight(const unsigned long *bitmap, int bits);
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, int bits);
 bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
@@ -61,7 +61,7 @@ static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
 	return find_first_zero_bit(src, nbits) == nbits;
 }
 
-static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static inline unsigned long bitmap_weight(const unsigned long *src, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index 2e351d63fdba..e1fafc131a49 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -5,9 +5,9 @@
  */
 #include <linux/bitmap.h>
 
-int __bitmap_weight(const unsigned long *bitmap, int bits)
+unsigned long __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 {
-	int k, w = 0, lim = bits/BITS_PER_LONG;
+	unsigned long k, w = 0, lim = bits/BITS_PER_LONG;
 
 	for (k = 0; k < lim; k++)
 		w += hweight_long(bitmap[k]);
-- 
cgit v1.2.3


From cb32c285cc10e428589194e30233d673e7c23c72 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:26 -0700
Subject: cpumask: change return types to bool where appropriate

Some cpumask functions have integer return types where return values
are naturally booleans.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..b54e27d9da6b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -372,9 +372,9 @@ static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns 1 if @cpu is set in @cpumask, else returns 0
+ * Returns true if @cpu is set in @cpumask, else returns false
  */
-static __always_inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
+static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
 {
 	return test_bit(cpumask_check(cpu), cpumask_bits((cpumask)));
 }
@@ -384,11 +384,11 @@ static __always_inline int cpumask_test_cpu(int cpu, const struct cpumask *cpuma
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
+ * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
  *
  * test_and_set_bit wrapper for cpumasks.
  */
-static __always_inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
+static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
 {
 	return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
@@ -398,11 +398,11 @@ static __always_inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpu
  * @cpu: cpu number (< nr_cpu_ids)
  * @cpumask: the cpumask pointer
  *
- * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
+ * Returns true if @cpu is set in old bitmap of @cpumask, else returns false
  *
  * test_and_clear_bit wrapper for cpumasks.
  */
-static __always_inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
+static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
 {
 	return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask));
 }
@@ -431,9 +431,9 @@ static inline void cpumask_clear(struct cpumask *dstp)
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns 0, else returns 1
+ * If *@dstp is empty, returns false, else returns true
  */
-static inline int cpumask_and(struct cpumask *dstp,
+static inline bool cpumask_and(struct cpumask *dstp,
 			       const struct cpumask *src1p,
 			       const struct cpumask *src2p)
 {
@@ -474,9 +474,9 @@ static inline void cpumask_xor(struct cpumask *dstp,
  * @src1p: the first input
  * @src2p: the second input
  *
- * If *@dstp is empty, returns 0, else returns 1
+ * If *@dstp is empty, returns false, else returns true
  */
-static inline int cpumask_andnot(struct cpumask *dstp,
+static inline bool cpumask_andnot(struct cpumask *dstp,
 				  const struct cpumask *src1p,
 				  const struct cpumask *src2p)
 {
@@ -539,9 +539,9 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
  * @src1p: the first input
  * @src2p: the second input
  *
- * Returns 1 if *@src1p is a subset of *@src2p, else returns 0
+ * Returns true if *@src1p is a subset of *@src2p, else returns false
  */
-static inline int cpumask_subset(const struct cpumask *src1p,
+static inline bool cpumask_subset(const struct cpumask *src1p,
 				 const struct cpumask *src2p)
 {
 	return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
-- 
cgit v1.2.3


From 8b6b795d9bfc031a8953c40fac8d3cf67e1a4d3d Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:27 -0700
Subject: lib/cpumask: change return types to unsigned where appropriate

Switch return types to unsigned int where return values cannot be negative.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 14 +++++++-------
 lib/cpumask.c           | 18 +++++++++---------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b54e27d9da6b..760022bcb925 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -176,12 +176,12 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 					     const struct cpumask *src2p) {
 	return cpumask_first_and(src1p, src2p);
 }
 
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
+static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	return cpumask_first(srcp);
 }
@@ -258,12 +258,12 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }
 
-int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
-int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+unsigned int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+unsigned int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
-int cpumask_any_and_distribute(const struct cpumask *src1p,
+unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
-int cpumask_any_distribute(const struct cpumask *srcp);
+unsigned int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
@@ -289,7 +289,7 @@ int cpumask_any_distribute(const struct cpumask *srcp);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
-extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a971a82d2f43..da68f6bbde44 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -31,7 +31,7 @@ EXPORT_SYMBOL(cpumask_next);
  *
  * Returns >= nr_cpu_ids if no further cpus set in both.
  */
-int cpumask_next_and(int n, const struct cpumask *src1p,
+unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 		     const struct cpumask *src2p)
 {
 	/* -1 is a legal arg here. */
@@ -50,7 +50,7 @@ EXPORT_SYMBOL(cpumask_next_and);
  * Often used to find any cpu but smp_processor_id() in a mask.
  * Returns >= nr_cpu_ids if no cpus set.
  */
-int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 {
 	unsigned int i;
 
@@ -74,9 +74,9 @@ EXPORT_SYMBOL(cpumask_any_but);
  * Note: the @wrap argument is required for the start condition when
  * we cannot assume @start is set in @mask.
  */
-int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
 {
-	int next;
+	unsigned int next;
 
 again:
 	next = cpumask_next(n, mask);
@@ -205,7 +205,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
  */
 unsigned int cpumask_local_spread(unsigned int i, int node)
 {
-	int cpu;
+	unsigned int cpu;
 
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
@@ -243,10 +243,10 @@ static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
  *
  * Returns >= nr_cpu_ids if the intersection is empty.
  */
-int cpumask_any_and_distribute(const struct cpumask *src1p,
+unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p)
 {
-	int next, prev;
+	unsigned int next, prev;
 
 	/* NOTE: our first selection will skip 0. */
 	prev = __this_cpu_read(distribute_cpu_mask_prev);
@@ -262,9 +262,9 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
 }
 EXPORT_SYMBOL(cpumask_any_and_distribute);
 
-int cpumask_any_distribute(const struct cpumask *srcp)
+unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
-	int next, prev;
+	unsigned int next, prev;
 
 	/* NOTE: our first selection will skip 0. */
 	prev = __this_cpu_read(distribute_cpu_mask_prev);
-- 
cgit v1.2.3


From 9b2e70860ef2f0d74b6d9e57929d57b14481b9c9 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:28 -0700
Subject: lib/cpumask: move trivial wrappers around find_bit to the header

To avoid circular dependencies, cpumask keeps simple (almost) one-line
wrappers around find_bit() in a c-file.

Commit 47d8c15615c0a2 ("include: move find.h from asm_generic to linux")
moved find.h header out of asm_generic include path, and it helped to fix
many circular dependencies, including some in cpumask.h.

This patch moves those one-liners to header files.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++---
 lib/cpumask.c           | 55 -----------------------------------------------
 2 files changed, 54 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 760022bcb925..ea3de2c2c180 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -241,7 +241,21 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
 	return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
-unsigned int __pure cpumask_next(int n, const struct cpumask *srcp);
+/**
+ * cpumask_next - get the next cpu in a cpumask
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @srcp: the cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set.
+ */
+static inline
+unsigned int cpumask_next(int n, const struct cpumask *srcp)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
+}
 
 /**
  * cpumask_next_zero - get the next unset cpu in a cpumask
@@ -258,8 +272,25 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }
 
-unsigned int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
-unsigned int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+/**
+ * cpumask_next_and - get the next cpu in *src1p & *src2p
+ * @n: the cpu prior to the place to search (ie. return will be > @n)
+ * @src1p: the first cpumask pointer
+ * @src2p: the second cpumask pointer
+ *
+ * Returns >= nr_cpu_ids if no further cpus set in both.
+ */
+static inline
+unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
+		     const struct cpumask *src2p)
+{
+	/* -1 is a legal arg here. */
+	if (n != -1)
+		cpumask_check(n);
+	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
+		nr_cpumask_bits, n + 1);
+}
+
 unsigned int cpumask_local_spread(unsigned int i, int node);
 unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
@@ -324,6 +355,26 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, boo
 	for ((cpu) = -1;						\
 		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
+
+/**
+ * cpumask_any_but - return a "random" in a cpumask, but not this one.
+ * @mask: the cpumask to search
+ * @cpu: the cpu to ignore.
+ *
+ * Often used to find any cpu but smp_processor_id() in a mask.
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+static inline
+unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
+{
+	unsigned int i;
+
+	cpumask_check(cpu);
+	for_each_cpu(i, mask)
+		if (i != cpu)
+			break;
+	return i;
+}
 #endif /* SMP */
 
 #define CPU_BITS_NONE						\
diff --git a/lib/cpumask.c b/lib/cpumask.c
index da68f6bbde44..cb7262ff8633 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -7,61 +7,6 @@
 #include <linux/memblock.h>
 #include <linux/numa.h>
 
-/**
- * cpumask_next - get the next cpu in a cpumask
- * @n: the cpu prior to the place to search (ie. return will be > @n)
- * @srcp: the cpumask pointer
- *
- * Returns >= nr_cpu_ids if no further cpus set.
- */
-unsigned int cpumask_next(int n, const struct cpumask *srcp)
-{
-	/* -1 is a legal arg here. */
-	if (n != -1)
-		cpumask_check(n);
-	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
-}
-EXPORT_SYMBOL(cpumask_next);
-
-/**
- * cpumask_next_and - get the next cpu in *src1p & *src2p
- * @n: the cpu prior to the place to search (ie. return will be > @n)
- * @src1p: the first cpumask pointer
- * @src2p: the second cpumask pointer
- *
- * Returns >= nr_cpu_ids if no further cpus set in both.
- */
-unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
-		     const struct cpumask *src2p)
-{
-	/* -1 is a legal arg here. */
-	if (n != -1)
-		cpumask_check(n);
-	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
-		nr_cpumask_bits, n + 1);
-}
-EXPORT_SYMBOL(cpumask_next_and);
-
-/**
- * cpumask_any_but - return a "random" in a cpumask, but not this one.
- * @mask: the cpumask to search
- * @cpu: the cpu to ignore.
- *
- * Often used to find any cpu but smp_processor_id() in a mask.
- * Returns >= nr_cpu_ids if no cpus set.
- */
-unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
-{
-	unsigned int i;
-
-	cpumask_check(cpu);
-	for_each_cpu(i, mask)
-		if (i != cpu)
-			break;
-	return i;
-}
-EXPORT_SYMBOL(cpumask_any_but);
-
 /**
  * cpumask_next_wrap - helper to implement for_each_cpu_wrap
  * @n: the cpu prior to the place to search
-- 
cgit v1.2.3


From db96b0c5f9db22d908ab5f7cd75904adba4b28ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 10 Jun 2021 10:56:49 +0200
Subject: headers/deps: mm: Optimize <linux/gfp.h> header dependencies

There's a couple of superfluous inclusions here - remove them before
doing bigger changes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/gfp.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2d2ccae933c2..52f2c873a7d4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -2,10 +2,7 @@
 #ifndef __LINUX_GFP_H
 #define __LINUX_GFP_H
 
-#include <linux/mmdebug.h>
 #include <linux/mmzone.h>
-#include <linux/stddef.h>
-#include <linux/linkage.h>
 #include <linux/topology.h>
 
 /* The typedef is in types.h but we want the documentation here */
-- 
cgit v1.2.3


From cb5a065b4ea9c062a18143c8a14e831179687f54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 14 Apr 2022 18:42:28 +0200
Subject: headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>

This is a much smaller header.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/gfp.h       | 345 +--------------------------------------------
 include/linux/gfp_types.h | 348 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 350 insertions(+), 343 deletions(-)
 create mode 100644 include/linux/gfp_types.h

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 52f2c873a7d4..f314be58fa77 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -2,354 +2,13 @@
 #ifndef __LINUX_GFP_H
 #define __LINUX_GFP_H
 
+#include <linux/gfp_types.h>
+
 #include <linux/mmzone.h>
 #include <linux/topology.h>
 
-/* The typedef is in types.h but we want the documentation here */
-#if 0
-/**
- * typedef gfp_t - Memory allocation flags.
- *
- * GFP flags are commonly used throughout Linux to indicate how memory
- * should be allocated.  The GFP acronym stands for get_free_pages(),
- * the underlying memory allocation function.  Not every GFP flag is
- * supported by every function which may allocate memory.  Most users
- * will want to use a plain ``GFP_KERNEL``.
- */
-typedef unsigned int __bitwise gfp_t;
-#endif
-
 struct vm_area_struct;
 
-/*
- * In case of changes, please don't forget to update
- * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
- */
-
-/* Plain integer GFP bitmasks. Do not use this directly. */
-#define ___GFP_DMA		0x01u
-#define ___GFP_HIGHMEM		0x02u
-#define ___GFP_DMA32		0x04u
-#define ___GFP_MOVABLE		0x08u
-#define ___GFP_RECLAIMABLE	0x10u
-#define ___GFP_HIGH		0x20u
-#define ___GFP_IO		0x40u
-#define ___GFP_FS		0x80u
-#define ___GFP_ZERO		0x100u
-#define ___GFP_ATOMIC		0x200u
-#define ___GFP_DIRECT_RECLAIM	0x400u
-#define ___GFP_KSWAPD_RECLAIM	0x800u
-#define ___GFP_WRITE		0x1000u
-#define ___GFP_NOWARN		0x2000u
-#define ___GFP_RETRY_MAYFAIL	0x4000u
-#define ___GFP_NOFAIL		0x8000u
-#define ___GFP_NORETRY		0x10000u
-#define ___GFP_MEMALLOC		0x20000u
-#define ___GFP_COMP		0x40000u
-#define ___GFP_NOMEMALLOC	0x80000u
-#define ___GFP_HARDWALL		0x100000u
-#define ___GFP_THISNODE		0x200000u
-#define ___GFP_ACCOUNT		0x400000u
-#define ___GFP_ZEROTAGS		0x800000u
-#ifdef CONFIG_KASAN_HW_TAGS
-#define ___GFP_SKIP_ZERO		0x1000000u
-#define ___GFP_SKIP_KASAN_UNPOISON	0x2000000u
-#define ___GFP_SKIP_KASAN_POISON	0x4000000u
-#else
-#define ___GFP_SKIP_ZERO		0
-#define ___GFP_SKIP_KASAN_UNPOISON	0
-#define ___GFP_SKIP_KASAN_POISON	0
-#endif
-#ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x8000000u
-#else
-#define ___GFP_NOLOCKDEP	0
-#endif
-/* If the above are modified, __GFP_BITS_SHIFT may need updating */
-
-/*
- * Physical address zone modifiers (see linux/mmzone.h - low four bits)
- *
- * Do not put any conditional on these. If necessary modify the definitions
- * without the underscores and use them consistently. The definitions here may
- * be used in bit comparisons.
- */
-#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
-#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
-#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
-#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
-#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
-
-/**
- * DOC: Page mobility and placement hints
- *
- * Page mobility and placement hints
- * ---------------------------------
- *
- * These flags provide hints about how mobile the page is. Pages with similar
- * mobility are placed within the same pageblocks to minimise problems due
- * to external fragmentation.
- *
- * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
- * moved by page migration during memory compaction or can be reclaimed.
- *
- * %__GFP_RECLAIMABLE is used for slab allocations that specify
- * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
- *
- * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
- * these pages will be spread between local zones to avoid all the dirty
- * pages being in one zone (fair zone allocation policy).
- *
- * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
- *
- * %__GFP_THISNODE forces the allocation to be satisfied from the requested
- * node with no fallbacks or placement policy enforcements.
- *
- * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
- */
-#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
-#define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
-#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
-#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
-#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
-
-/**
- * DOC: Watermark modifiers
- *
- * Watermark modifiers -- controls access to emergency reserves
- * ------------------------------------------------------------
- *
- * %__GFP_HIGH indicates that the caller is high-priority and that granting
- * the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
- *
- * %__GFP_MEMALLOC allows access to all memory. This should only be used when
- * the caller guarantees the allocation will allow more memory to be freed
- * very shortly e.g. process exiting or swapping. Users either should
- * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
- * Users of this flag have to be extremely careful to not deplete the reserve
- * completely and implement a throttling mechanism which controls the
- * consumption of the reserve based on the amount of freed memory.
- * Usage of a pre-allocated pool (e.g. mempool) should be always considered
- * before using this flag.
- *
- * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
- * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
- */
-#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
-#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
-#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
-#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
-
-/**
- * DOC: Reclaim modifiers
- *
- * Reclaim modifiers
- * -----------------
- * Please note that all the following flags are only applicable to sleepable
- * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
- *
- * %__GFP_IO can start physical IO.
- *
- * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
- * allocator recursing into the filesystem which might already be holding
- * locks.
- *
- * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
- * This flag can be cleared to avoid unnecessary delays when a fallback
- * option is available.
- *
- * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
- * the low watermark is reached and have it reclaim pages until the high
- * watermark is reached. A caller may wish to clear this flag when fallback
- * options are available and the reclaim is likely to disrupt the system. The
- * canonical example is THP allocation where a fallback is cheap but
- * reclaim/compaction may cause indirect stalls.
- *
- * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
- *
- * The default allocator behavior depends on the request size. We have a concept
- * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
- * !costly allocations are too essential to fail so they are implicitly
- * non-failing by default (with some exceptions like OOM victims might fail so
- * the caller still has to check for failures) while costly requests try to be
- * not disruptive and back off even without invoking the OOM killer.
- * The following three modifiers might be used to override some of these
- * implicit rules
- *
- * %__GFP_NORETRY: The VM implementation will try only very lightweight
- * memory direct reclaim to get some memory under memory pressure (thus
- * it can sleep). It will avoid disruptive actions like OOM killer. The
- * caller must handle the failure which is quite likely to happen under
- * heavy memory pressure. The flag is suitable when failure can easily be
- * handled at small cost, such as reduced throughput
- *
- * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
- * procedures that have previously failed if there is some indication
- * that progress has been made else where.  It can wait for other
- * tasks to attempt high level approaches to freeing memory such as
- * compaction (which removes fragmentation) and page-out.
- * There is still a definite limit to the number of retries, but it is
- * a larger limit than with %__GFP_NORETRY.
- * Allocations with this flag may fail, but only when there is
- * genuinely little unused memory. While these allocations do not
- * directly trigger the OOM killer, their failure indicates that
- * the system is likely to need to use the OOM killer soon.  The
- * caller must handle failure, but can reasonably do so by failing
- * a higher-level request, or completing it only in a much less
- * efficient manner.
- * If the allocation does fail, and the caller is in a position to
- * free some non-essential memory, doing so could benefit the system
- * as a whole.
- *
- * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
- * cannot handle allocation failures. The allocation could block
- * indefinitely but will never return with failure. Testing for
- * failure is pointless.
- * New users should be evaluated carefully (and the flag should be
- * used only when there is no reasonable failure policy) but it is
- * definitely preferable to use the flag rather than opencode endless
- * loop around allocator.
- * Using this flag for costly allocations is _highly_ discouraged.
- */
-#define __GFP_IO	((__force gfp_t)___GFP_IO)
-#define __GFP_FS	((__force gfp_t)___GFP_FS)
-#define __GFP_DIRECT_RECLAIM	((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
-#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
-#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
-#define __GFP_RETRY_MAYFAIL	((__force gfp_t)___GFP_RETRY_MAYFAIL)
-#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
-#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
-
-/**
- * DOC: Action modifiers
- *
- * Action modifiers
- * ----------------
- *
- * %__GFP_NOWARN suppresses allocation failure reports.
- *
- * %__GFP_COMP address compound page metadata.
- *
- * %__GFP_ZERO returns a zeroed page on success.
- *
- * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
- * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
- * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
- * memory tags at the same time as zeroing memory has minimal additional
- * performace impact.
- *
- * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
- * Only effective in HW_TAGS mode.
- *
- * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
- * Typically, used for userspace pages. Only effective in HW_TAGS mode.
- */
-#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
-#define __GFP_COMP	((__force gfp_t)___GFP_COMP)
-#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
-#define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
-#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
-#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
-#define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
-
-/* Disable lockdep for GFP context tracking */
-#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
-
-/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
-#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-
-/**
- * DOC: Useful GFP flag combinations
- *
- * Useful GFP flag combinations
- * ----------------------------
- *
- * Useful GFP flag combinations that are commonly used. It is recommended
- * that subsystems start with one of these combinations and then set/clear
- * %__GFP_FOO flags as necessary.
- *
- * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
- * watermark is applied to allow access to "atomic reserves".
- * The current implementation doesn't support NMI and few other strict
- * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
- *
- * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
- * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
- *
- * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
- * accounted to kmemcg.
- *
- * %GFP_NOWAIT is for kernel allocations that should not stall for direct
- * reclaim, start physical IO or use any filesystem callback.
- *
- * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
- * that do not require the starting of any physical IO.
- * Please try to avoid using this flag directly and instead use
- * memalloc_noio_{save,restore} to mark the whole scope which cannot
- * perform any IO with a short explanation why. All allocation requests
- * will inherit GFP_NOIO implicitly.
- *
- * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
- * Please try to avoid using this flag directly and instead use
- * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
- * recurse into the FS layer with a short explanation why. All allocation
- * requests will inherit GFP_NOFS implicitly.
- *
- * %GFP_USER is for userspace allocations that also need to be directly
- * accessibly by the kernel or hardware. It is typically used by hardware
- * for buffers that are mapped to userspace (e.g. graphics) that hardware
- * still must DMA to. cpuset limits are enforced for these allocations.
- *
- * %GFP_DMA exists for historical reasons and should be avoided where possible.
- * The flags indicates that the caller requires that the lowest zone be
- * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
- * it would require careful auditing as some users really require it and
- * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
- * lowest zone as a type of emergency reserve.
- *
- * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
- * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
- * because the DMA32 kmalloc cache array is not implemented.
- * (Reason: there is no such user in kernel).
- *
- * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
- * do not need to be directly accessible by the kernel but that cannot
- * move once in use. An example may be a hardware allocation that maps
- * data directly into userspace but has no addressing limitations.
- *
- * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
- * need direct access to but can use kmap() when access is required. They
- * are expected to be movable via page reclaim or page migration. Typically,
- * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
- *
- * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
- * are compound allocations that will generally fail quickly if memory is not
- * available and will not wake kswapd/kcompactd on failure. The _LIGHT
- * version does not attempt reclaim/compaction at all and is by default used
- * in page fault path, while the non-light is used by khugepaged.
- */
-#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
-#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
-#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
-#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
-#define GFP_NOIO	(__GFP_RECLAIM)
-#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
-#define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
-#define GFP_DMA		__GFP_DMA
-#define GFP_DMA32	__GFP_DMA32
-#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
-#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
-			 __GFP_SKIP_KASAN_POISON)
-#define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
-#define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
-
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
 #define GFP_MOVABLE_SHIFT 3
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
new file mode 100644
index 000000000000..06fc85cee23f
--- /dev/null
+++ b/include/linux/gfp_types.h
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_GFP_TYPES_H
+#define __LINUX_GFP_TYPES_H
+
+/* The typedef is in types.h but we want the documentation here */
+#if 0
+/**
+ * typedef gfp_t - Memory allocation flags.
+ *
+ * GFP flags are commonly used throughout Linux to indicate how memory
+ * should be allocated.  The GFP acronym stands for get_free_pages(),
+ * the underlying memory allocation function.  Not every GFP flag is
+ * supported by every function which may allocate memory.  Most users
+ * will want to use a plain ``GFP_KERNEL``.
+ */
+typedef unsigned int __bitwise gfp_t;
+#endif
+
+/*
+ * In case of changes, please don't forget to update
+ * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
+ */
+
+/* Plain integer GFP bitmasks. Do not use this directly. */
+#define ___GFP_DMA		0x01u
+#define ___GFP_HIGHMEM		0x02u
+#define ___GFP_DMA32		0x04u
+#define ___GFP_MOVABLE		0x08u
+#define ___GFP_RECLAIMABLE	0x10u
+#define ___GFP_HIGH		0x20u
+#define ___GFP_IO		0x40u
+#define ___GFP_FS		0x80u
+#define ___GFP_ZERO		0x100u
+#define ___GFP_ATOMIC		0x200u
+#define ___GFP_DIRECT_RECLAIM	0x400u
+#define ___GFP_KSWAPD_RECLAIM	0x800u
+#define ___GFP_WRITE		0x1000u
+#define ___GFP_NOWARN		0x2000u
+#define ___GFP_RETRY_MAYFAIL	0x4000u
+#define ___GFP_NOFAIL		0x8000u
+#define ___GFP_NORETRY		0x10000u
+#define ___GFP_MEMALLOC		0x20000u
+#define ___GFP_COMP		0x40000u
+#define ___GFP_NOMEMALLOC	0x80000u
+#define ___GFP_HARDWALL		0x100000u
+#define ___GFP_THISNODE		0x200000u
+#define ___GFP_ACCOUNT		0x400000u
+#define ___GFP_ZEROTAGS		0x800000u
+#ifdef CONFIG_KASAN_HW_TAGS
+#define ___GFP_SKIP_ZERO		0x1000000u
+#define ___GFP_SKIP_KASAN_UNPOISON	0x2000000u
+#define ___GFP_SKIP_KASAN_POISON	0x4000000u
+#else
+#define ___GFP_SKIP_ZERO		0
+#define ___GFP_SKIP_KASAN_UNPOISON	0
+#define ___GFP_SKIP_KASAN_POISON	0
+#endif
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP	0x8000000u
+#else
+#define ___GFP_NOLOCKDEP	0
+#endif
+/* If the above are modified, __GFP_BITS_SHIFT may need updating */
+
+/*
+ * Physical address zone modifiers (see linux/mmzone.h - low four bits)
+ *
+ * Do not put any conditional on these. If necessary modify the definitions
+ * without the underscores and use them consistently. The definitions here may
+ * be used in bit comparisons.
+ */
+#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
+#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
+#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
+#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
+#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
+
+/**
+ * DOC: Page mobility and placement hints
+ *
+ * Page mobility and placement hints
+ * ---------------------------------
+ *
+ * These flags provide hints about how mobile the page is. Pages with similar
+ * mobility are placed within the same pageblocks to minimise problems due
+ * to external fragmentation.
+ *
+ * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
+ * moved by page migration during memory compaction or can be reclaimed.
+ *
+ * %__GFP_RECLAIMABLE is used for slab allocations that specify
+ * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
+ *
+ * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
+ * these pages will be spread between local zones to avoid all the dirty
+ * pages being in one zone (fair zone allocation policy).
+ *
+ * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
+ *
+ * %__GFP_THISNODE forces the allocation to be satisfied from the requested
+ * node with no fallbacks or placement policy enforcements.
+ *
+ * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ */
+#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
+#define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
+#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
+#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
+#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
+
+/**
+ * DOC: Watermark modifiers
+ *
+ * Watermark modifiers -- controls access to emergency reserves
+ * ------------------------------------------------------------
+ *
+ * %__GFP_HIGH indicates that the caller is high-priority and that granting
+ * the request is necessary before the system can make forward progress.
+ * For example, creating an IO context to clean pages.
+ *
+ * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
+ * high priority. Users are typically interrupt handlers. This may be
+ * used in conjunction with %__GFP_HIGH
+ *
+ * %__GFP_MEMALLOC allows access to all memory. This should only be used when
+ * the caller guarantees the allocation will allow more memory to be freed
+ * very shortly e.g. process exiting or swapping. Users either should
+ * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
+ * Users of this flag have to be extremely careful to not deplete the reserve
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory.
+ * Usage of a pre-allocated pool (e.g. mempool) should be always considered
+ * before using this flag.
+ *
+ * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
+ * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
+ */
+#define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
+#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
+#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
+#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
+
+/**
+ * DOC: Reclaim modifiers
+ *
+ * Reclaim modifiers
+ * -----------------
+ * Please note that all the following flags are only applicable to sleepable
+ * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
+ *
+ * %__GFP_IO can start physical IO.
+ *
+ * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
+ * allocator recursing into the filesystem which might already be holding
+ * locks.
+ *
+ * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
+ * This flag can be cleared to avoid unnecessary delays when a fallback
+ * option is available.
+ *
+ * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
+ * the low watermark is reached and have it reclaim pages until the high
+ * watermark is reached. A caller may wish to clear this flag when fallback
+ * options are available and the reclaim is likely to disrupt the system. The
+ * canonical example is THP allocation where a fallback is cheap but
+ * reclaim/compaction may cause indirect stalls.
+ *
+ * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
+ *
+ * The default allocator behavior depends on the request size. We have a concept
+ * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
+ * !costly allocations are too essential to fail so they are implicitly
+ * non-failing by default (with some exceptions like OOM victims might fail so
+ * the caller still has to check for failures) while costly requests try to be
+ * not disruptive and back off even without invoking the OOM killer.
+ * The following three modifiers might be used to override some of these
+ * implicit rules
+ *
+ * %__GFP_NORETRY: The VM implementation will try only very lightweight
+ * memory direct reclaim to get some memory under memory pressure (thus
+ * it can sleep). It will avoid disruptive actions like OOM killer. The
+ * caller must handle the failure which is quite likely to happen under
+ * heavy memory pressure. The flag is suitable when failure can easily be
+ * handled at small cost, such as reduced throughput
+ *
+ * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
+ * procedures that have previously failed if there is some indication
+ * that progress has been made else where.  It can wait for other
+ * tasks to attempt high level approaches to freeing memory such as
+ * compaction (which removes fragmentation) and page-out.
+ * There is still a definite limit to the number of retries, but it is
+ * a larger limit than with %__GFP_NORETRY.
+ * Allocations with this flag may fail, but only when there is
+ * genuinely little unused memory. While these allocations do not
+ * directly trigger the OOM killer, their failure indicates that
+ * the system is likely to need to use the OOM killer soon.  The
+ * caller must handle failure, but can reasonably do so by failing
+ * a higher-level request, or completing it only in a much less
+ * efficient manner.
+ * If the allocation does fail, and the caller is in a position to
+ * free some non-essential memory, doing so could benefit the system
+ * as a whole.
+ *
+ * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
+ * cannot handle allocation failures. The allocation could block
+ * indefinitely but will never return with failure. Testing for
+ * failure is pointless.
+ * New users should be evaluated carefully (and the flag should be
+ * used only when there is no reasonable failure policy) but it is
+ * definitely preferable to use the flag rather than opencode endless
+ * loop around allocator.
+ * Using this flag for costly allocations is _highly_ discouraged.
+ */
+#define __GFP_IO	((__force gfp_t)___GFP_IO)
+#define __GFP_FS	((__force gfp_t)___GFP_FS)
+#define __GFP_DIRECT_RECLAIM	((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
+#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
+#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
+#define __GFP_RETRY_MAYFAIL	((__force gfp_t)___GFP_RETRY_MAYFAIL)
+#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
+#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
+
+/**
+ * DOC: Action modifiers
+ *
+ * Action modifiers
+ * ----------------
+ *
+ * %__GFP_NOWARN suppresses allocation failure reports.
+ *
+ * %__GFP_COMP address compound page metadata.
+ *
+ * %__GFP_ZERO returns a zeroed page on success.
+ *
+ * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
+ * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
+ * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
+ * memory tags at the same time as zeroing memory has minimal additional
+ * performace impact.
+ *
+ * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
+ * Only effective in HW_TAGS mode.
+ *
+ * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
+ * Typically, used for userspace pages. Only effective in HW_TAGS mode.
+ */
+#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
+#define __GFP_COMP	((__force gfp_t)___GFP_COMP)
+#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
+#define __GFP_ZEROTAGS	((__force gfp_t)___GFP_ZEROTAGS)
+#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
+#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
+#define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+
+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
+/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+
+/**
+ * DOC: Useful GFP flag combinations
+ *
+ * Useful GFP flag combinations
+ * ----------------------------
+ *
+ * Useful GFP flag combinations that are commonly used. It is recommended
+ * that subsystems start with one of these combinations and then set/clear
+ * %__GFP_FOO flags as necessary.
+ *
+ * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
+ * watermark is applied to allow access to "atomic reserves".
+ * The current implementation doesn't support NMI and few other strict
+ * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
+ *
+ * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
+ * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
+ *
+ * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
+ * accounted to kmemcg.
+ *
+ * %GFP_NOWAIT is for kernel allocations that should not stall for direct
+ * reclaim, start physical IO or use any filesystem callback.
+ *
+ * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
+ * that do not require the starting of any physical IO.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_noio_{save,restore} to mark the whole scope which cannot
+ * perform any IO with a short explanation why. All allocation requests
+ * will inherit GFP_NOIO implicitly.
+ *
+ * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+ * recurse into the FS layer with a short explanation why. All allocation
+ * requests will inherit GFP_NOFS implicitly.
+ *
+ * %GFP_USER is for userspace allocations that also need to be directly
+ * accessibly by the kernel or hardware. It is typically used by hardware
+ * for buffers that are mapped to userspace (e.g. graphics) that hardware
+ * still must DMA to. cpuset limits are enforced for these allocations.
+ *
+ * %GFP_DMA exists for historical reasons and should be avoided where possible.
+ * The flags indicates that the caller requires that the lowest zone be
+ * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
+ * it would require careful auditing as some users really require it and
+ * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
+ * lowest zone as a type of emergency reserve.
+ *
+ * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
+ * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
+ * because the DMA32 kmalloc cache array is not implemented.
+ * (Reason: there is no such user in kernel).
+ *
+ * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
+ * do not need to be directly accessible by the kernel but that cannot
+ * move once in use. An example may be a hardware allocation that maps
+ * data directly into userspace but has no addressing limitations.
+ *
+ * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
+ * need direct access to but can use kmap() when access is required. They
+ * are expected to be movable via page reclaim or page migration. Typically,
+ * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
+ *
+ * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
+ * are compound allocations that will generally fail quickly if memory is not
+ * available and will not wake kswapd/kcompactd on failure. The _LIGHT
+ * version does not attempt reclaim/compaction at all and is by default used
+ * in page fault path, while the non-light is used by khugepaged.
+ */
+#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
+#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
+#define GFP_NOIO	(__GFP_RECLAIM)
+#define GFP_NOFS	(__GFP_RECLAIM | __GFP_IO)
+#define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_DMA		__GFP_DMA
+#define GFP_DMA32	__GFP_DMA32
+#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
+#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE | \
+			 __GFP_SKIP_KASAN_POISON)
+#define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
+
+#endif /* __LINUX_GFP_TYPES_H */
-- 
cgit v1.2.3


From f0dd891dd5a1d6dc6c9d486333aac4f433f17d17 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 1 Jul 2022 05:54:30 -0700
Subject: lib/cpumask: move some one-line wrappers to header file

After moving gfp flags to a separate header, it's possible to move some
cpumask allocators into headers, and avoid creating real functions.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 34 +++++++++++++++++++++++++++++++---
 lib/cpumask.c           | 28 ----------------------------
 2 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ea3de2c2c180..80627362c774 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -12,6 +12,8 @@
 #include <linux/bitmap.h>
 #include <linux/atomic.h>
 #include <linux/bug.h>
+#include <linux/gfp_types.h>
+#include <linux/numa.h>
 
 /* Don't assign or return these: may not be this big! */
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
@@ -794,9 +796,35 @@ typedef struct cpumask *cpumask_var_t;
 #define __cpumask_var_read_mostly	__read_mostly
 
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
-bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
-bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
-bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
+
+static inline
+bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
+{
+	return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
+}
+
+/**
+ * alloc_cpumask_var - allocate a struct cpumask
+ * @mask: pointer to cpumask_var_t where the cpumask is returned
+ * @flags: GFP_ flags
+ *
+ * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
+ * a nop returning a constant 1 (in <linux/cpumask.h>).
+ *
+ * See alloc_cpumask_var_node.
+ */
+static inline
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
+}
+
+static inline
+bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
+}
+
 void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
 void free_bootmem_cpumask_var(cpumask_var_t mask);
diff --git a/lib/cpumask.c b/lib/cpumask.c
index cb7262ff8633..f0ae119be8c4 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -70,34 +70,6 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 }
 EXPORT_SYMBOL(alloc_cpumask_var_node);
 
-bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
-{
-	return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
-}
-EXPORT_SYMBOL(zalloc_cpumask_var_node);
-
-/**
- * alloc_cpumask_var - allocate a struct cpumask
- * @mask: pointer to cpumask_var_t where the cpumask is returned
- * @flags: GFP_ flags
- *
- * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
- * a nop returning a constant 1 (in <linux/cpumask.h>).
- *
- * See alloc_cpumask_var_node.
- */
-bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
-{
-	return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(alloc_cpumask_var);
-
-bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
-{
-	return alloc_cpumask_var(mask, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(zalloc_cpumask_var);
-
 /**
  * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
  * @mask: pointer to cpumask_var_t where the cpumask is returned
-- 
cgit v1.2.3


From 0563231f93c6d1f582b168a47753b345c1e20d81 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2022 18:44:54 -0400
Subject: tracing/events: Add __vstring() and __assign_vstr() helper macros

There's several places that open code the following logic:

  TP_STRUCT__entry(__dynamic_array(char, msg, MSG_MAX)),
  TP_fast_assign(vsnprintf(__get_str(msg), MSG_MAX, vaf->fmt, *vaf->va);)

To load a string created by variable array va_list.

The main issue with this approach is that "MSG_MAX" usage in the
__dynamic_array() portion. That actually just reserves the MSG_MAX in the
event, and even wastes space because there's dynamic meta data also saved
in the event to denote the offset and size of the dynamic array. It would
have been better to just use a static __array() field.

Instead, create __vstring() and __assign_vstr() that work like __string
and __assign_str() but instead of taking a destination string to copy,
take a format string and a va_list pointer and fill in the values.

It uses the helper:

 #define __trace_event_vstr_len(fmt, va)		\
 ({							\
	va_list __ap;					\
	int __ret;					\
							\
	va_copy(__ap, *(va));				\
	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;	\
	va_end(__ap);					\
							\
	min(__ret, TRACE_EVENT_STR_MAX);		\
 })

To figure out the length to store the string. It may be slightly slower as
it needs to run the vsnprintf() twice, but it now saves space on the ring
buffer.

Link: https://lkml.kernel.org/r/20220705224749.053570613@goodmis.org

Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Kalle Valo <kvalo@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Arend van Spriel <aspriel@gmail.com>
Cc: Franky Lin <franky.lin@broadcom.com>
Cc: Hante Meuleman <hante.meuleman@broadcom.com>
Cc: Gregory Greenman <gregory.greenman@intel.com>
Cc: Peter Chen <peter.chen@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mathias Nyman <mathias.nyman@intel.com>
Cc: Chunfeng Yun <chunfeng.yun@mediatek.com>
Cc: Bin Liu <b-liu@ti.com>
Cc: Marek Lindner <mareklindner@neomailbox.ch>
Cc: Simon Wunderlich <sw@simonwunderlich.de>
Cc: Antonio Quartulli <a@unstable.cc>
Cc: Sven Eckelmann <sven@narfation.org>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h                 | 18 ++++++++++++++++++
 include/trace/stages/stage1_struct_define.h  |  3 +++
 include/trace/stages/stage2_data_offsets.h   |  3 +++
 include/trace/stages/stage4_event_fields.h   |  3 +++
 include/trace/stages/stage5_get_offsets.h    |  4 ++++
 include/trace/stages/stage6_event_callback.h |  7 +++++++
 6 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index e6e95a9f07a5..b18759a673c6 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -916,6 +916,24 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 
 #endif
 
+#define TRACE_EVENT_STR_MAX	512
+
+/*
+ * gcc warns that you can not use a va_list in an inlined
+ * function. But lets me make it into a macro :-/
+ */
+#define __trace_event_vstr_len(fmt, va)			\
+({							\
+	va_list __ap;					\
+	int __ret;					\
+							\
+	va_copy(__ap, *(va));				\
+	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;	\
+	va_end(__ap);					\
+							\
+	min(__ret, TRACE_EVENT_STR_MAX);		\
+})
+
 #endif /* _LINUX_TRACE_EVENT_H */
 
 /*
diff --git a/include/trace/stages/stage1_struct_define.h b/include/trace/stages/stage1_struct_define.h
index a16783419687..1b7bab60434c 100644
--- a/include/trace/stages/stage1_struct_define.h
+++ b/include/trace/stages/stage1_struct_define.h
@@ -26,6 +26,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1)
 
diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h
index 42fd1e8813ec..1b7a8f764fdd 100644
--- a/include/trace/stages/stage2_data_offsets.h
+++ b/include/trace/stages/stage2_data_offsets.h
@@ -32,6 +32,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
 
diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h
index e80cdc397a43..c3790ec7a453 100644
--- a/include/trace/stages/stage4_event_fields.h
+++ b/include/trace/stages/stage4_event_fields.h
@@ -38,6 +38,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
 
diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h
index 7ee5931300e6..fba4c24ed9e6 100644
--- a/include/trace/stages/stage5_get_offsets.h
+++ b/include/trace/stages/stage5_get_offsets.h
@@ -39,6 +39,10 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item,		\
+		      __trace_event_vstr_len(fmt, ap))
+
 #undef __rel_dynamic_array
 #define __rel_dynamic_array(type, item, len)				\
 	__item_length = (len) * sizeof(type);				\
diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
index e1724f73594b..0f51f6b3ab70 100644
--- a/include/trace/stages/stage6_event_callback.h
+++ b/include/trace/stages/stage6_event_callback.h
@@ -24,6 +24,9 @@
 #undef __string_len
 #define __string_len(item, src, len) __dynamic_array(char, item, -1)
 
+#undef __vstring
+#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
+
 #undef __assign_str
 #define __assign_str(dst, src)						\
 	strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");
@@ -35,6 +38,10 @@
 		__get_str(dst)[len] = '\0';				\
 	} while(0)
 
+#undef __assign_vstr
+#define __assign_vstr(dst, fmt, va)					\
+	vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, *(va))
+
 #undef __bitmask
 #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
 
-- 
cgit v1.2.3


From f484da847a01936e1d087a80cc96e8254492527c Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:03 -0700
Subject: net/mlx5: Expose the ability to point to any UID from shared UID

Expose shared_object_to_user_object_allowed, this capability
means an object created with shared UID can point to any UID.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 8e87eb47f9dc..9321d774e2d8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1371,7 +1371,9 @@ enum {
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-	u8         reserved_at_0[0x1f];
+	u8         reserved_at_0[0x10];
+	u8         shared_object_to_user_object_allowed[0x1];
+	u8         reserved_at_13[0xe];
 	u8         vhca_resource_manager[0x1];
 
 	u8         hca_cap_2[0x1];
@@ -8507,7 +8509,7 @@ struct mlx5_ifc_create_flow_table_out_bits {
 
 struct mlx5_ifc_create_flow_table_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_at_10[0x10];
+	u8         uid[0x10];
 
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
-- 
cgit v1.2.3


From 6c27c56cdc69608211617eea217e1a6cecccc675 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:04 -0700
Subject: net/mlx5: fs, expose flow table ID to users

Expose the flow table ID to users. This will be used by downstream
patches to allow creating steering rules that point to a flow table ID.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 6 ++++++
 include/linux/mlx5/fs.h                           | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 14187e50e2f9..1da3dc7c95fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1195,6 +1195,12 @@ struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 }
 EXPORT_SYMBOL(mlx5_create_flow_table);
 
+u32 mlx5_flow_table_id(struct mlx5_flow_table *ft)
+{
+	return ft->id;
+}
+EXPORT_SYMBOL(mlx5_flow_table_id);
+
 struct mlx5_flow_table *
 mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
 			     struct mlx5_flow_table_attr *ft_attr, u16 vport)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index ece3e35622d7..eee07d416b56 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -315,4 +315,5 @@ struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				  struct mlx5_pkt_reformat *reformat);
 
+u32 mlx5_flow_table_id(struct mlx5_flow_table *ft);
 #endif
-- 
cgit v1.2.3


From b0bb369ee451323968b31392a86398f15a2ba183 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Sun, 3 Jul 2022 13:54:05 -0700
Subject: net/mlx5: fs, allow flow table creation with a UID

Add UID field to flow table attributes to allow creating flow tables
with a non default (zero) uid.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c         | 16 ++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h         |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c        |  2 +-
 .../net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c    |  1 +
 .../net/ethernet/mellanox/mlx5/core/steering/dr_table.c  |  8 +++++---
 .../net/ethernet/mellanox/mlx5/core/steering/dr_types.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c |  7 ++++---
 .../net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h    |  3 ++-
 include/linux/mlx5/fs.h                                  |  1 +
 9 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 735dc805dad7..e735e19461ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -50,10 +50,12 @@ static int mlx5_cmd_stub_update_root_ft(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_stub_create_flow_table(struct mlx5_flow_root_namespace *ns,
 					   struct mlx5_flow_table *ft,
-					   unsigned int size,
+					   struct mlx5_flow_table_attr *ft_attr,
 					   struct mlx5_flow_table *next_ft)
 {
-	ft->max_fte = size ? roundup_pow_of_two(size) : 1;
+	int max_fte = ft_attr->max_fte;
+
+	ft->max_fte = max_fte ? roundup_pow_of_two(max_fte) : 1;
 
 	return 0;
 }
@@ -258,7 +260,7 @@ static int mlx5_cmd_update_root_ft(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 				      struct mlx5_flow_table *ft,
-				      unsigned int size,
+				      struct mlx5_flow_table_attr *ft_attr,
 				      struct mlx5_flow_table *next_ft)
 {
 	int en_encap = !!(ft->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
@@ -267,17 +269,19 @@ static int mlx5_cmd_create_flow_table(struct mlx5_flow_root_namespace *ns,
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)] = {};
 	struct mlx5_core_dev *dev = ns->dev;
+	unsigned int size;
 	int err;
 
-	if (size != POOL_NEXT_SIZE)
-		size = roundup_pow_of_two(size);
-	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, size);
+	if (ft_attr->max_fte != POOL_NEXT_SIZE)
+		size = roundup_pow_of_two(ft_attr->max_fte);
+	size = mlx5_ft_pool_get_avail_sz(dev, ft->type, ft_attr->max_fte);
 	if (!size)
 		return -ENOSPC;
 
 	MLX5_SET(create_flow_table_in, in, opcode,
 		 MLX5_CMD_OP_CREATE_FLOW_TABLE);
 
+	MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid);
 	MLX5_SET(create_flow_table_in, in, table_type, ft->type);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.level, ft->level);
 	MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, size ? ilog2(size) : 0);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 274004e80f03..8ef4254b9ea1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -38,7 +38,7 @@
 struct mlx5_flow_cmds {
 	int (*create_flow_table)(struct mlx5_flow_root_namespace *ns,
 				 struct mlx5_flow_table *ft,
-				 unsigned int size,
+				 struct mlx5_flow_table_attr *ft_attr,
 				 struct mlx5_flow_table *next_ft);
 	int (*destroy_flow_table)(struct mlx5_flow_root_namespace *ns,
 				  struct mlx5_flow_table *ft);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 1da3dc7c95fa..35d89edb1bcd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1155,7 +1155,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 			      find_next_chained_ft(fs_prio);
 	ft->def_miss_action = ns->def_miss_action;
 	ft->ns = ns;
-	err = root->cmds->create_flow_table(root, ft, ft_attr->max_fte, next_ft);
+	err = root->cmds->create_flow_table(root, ft, ft_attr, next_ft);
 	if (err)
 		goto free_ft;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 223c8741b7ae..16d65fe4f654 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -439,6 +439,7 @@ int mlx5dr_cmd_create_flow_table(struct mlx5_core_dev *mdev,
 
 	MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE);
 	MLX5_SET(create_flow_table_in, in, table_type, attr->table_type);
+	MLX5_SET(create_flow_table_in, in, uid, attr->uid);
 
 	ft_mdev = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context);
 	MLX5_SET(flow_table_context, ft_mdev, termination_table, attr->term_tbl);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
index e5f6412baea9..31d443dd8386 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_table.c
@@ -214,7 +214,7 @@ static int dr_table_destroy_sw_owned_tbl(struct mlx5dr_table *tbl)
 					     tbl->table_type);
 }
 
-static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
+static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl, u16 uid)
 {
 	bool en_encap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT);
 	bool en_decap = !!(tbl->flags & MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
@@ -236,6 +236,7 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
 	ft_attr.sw_owner = true;
 	ft_attr.decap_en = en_decap;
 	ft_attr.reformat_en = en_encap;
+	ft_attr.uid = uid;
 
 	ret = mlx5dr_cmd_create_flow_table(tbl->dmn->mdev, &ft_attr,
 					   NULL, &tbl->table_id);
@@ -243,7 +244,8 @@ static int dr_table_create_sw_owned_tbl(struct mlx5dr_table *tbl)
 	return ret;
 }
 
-struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u32 flags)
+struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level,
+					 u32 flags, u16 uid)
 {
 	struct mlx5dr_table *tbl;
 	int ret;
@@ -263,7 +265,7 @@ struct mlx5dr_table *mlx5dr_table_create(struct mlx5dr_domain *dmn, u32 level, u
 	if (ret)
 		goto free_tbl;
 
-	ret = dr_table_create_sw_owned_tbl(tbl);
+	ret = dr_table_create_sw_owned_tbl(tbl, uid);
 	if (ret)
 		goto uninit_tbl;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 98320e3945ad..50b0dd4fb4a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -1200,6 +1200,7 @@ struct mlx5dr_cmd_query_flow_table_details {
 
 struct mlx5dr_cmd_create_flow_table_attr {
 	u32 table_type;
+	u16 uid;
 	u64 icm_addr_rx;
 	u64 icm_addr_tx;
 	u8 level;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index 6a9abba92df6..c30ed8e18458 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -62,7 +62,7 @@ static int set_miss_action(struct mlx5_flow_root_namespace *ns,
 
 static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 					 struct mlx5_flow_table *ft,
-					 unsigned int size,
+					 struct mlx5_flow_table_attr *ft_attr,
 					 struct mlx5_flow_table *next_ft)
 {
 	struct mlx5dr_table *tbl;
@@ -71,7 +71,7 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 
 	if (mlx5_dr_is_fw_table(ft->flags))
 		return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft,
-								    size,
+								    ft_attr,
 								    next_ft);
 	flags = ft->flags;
 	/* turn off encap/decap if not supported for sw-str by fw */
@@ -79,7 +79,8 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns,
 		flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 				      MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
-	tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags);
+	tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags,
+				  ft_attr->uid);
 	if (!tbl) {
 		mlx5_core_err(ns->dev, "Failed creating dr flow_table\n");
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
index 7626c85643b1..3bb14860b36d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h
@@ -51,7 +51,8 @@ void mlx5dr_domain_set_peer(struct mlx5dr_domain *dmn,
 			    struct mlx5dr_domain *peer_dmn);
 
 struct mlx5dr_table *
-mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags);
+mlx5dr_table_create(struct mlx5dr_domain *domain, u32 level, u32 flags,
+		    u16 uid);
 
 struct mlx5dr_table *
 mlx5dr_table_get_from_fs_ft(struct mlx5_flow_table *ft);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index eee07d416b56..8e73c377da2c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -178,6 +178,7 @@ struct mlx5_flow_table_attr {
 	int max_fte;
 	u32 level;
 	u32 flags;
+	u16 uid;
 	struct mlx5_flow_table *next_ft;
 
 	struct {
-- 
cgit v1.2.3


From 335e52c28cf9954d65b819cb68912fd32de3c844 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 1 Jul 2022 17:16:19 +0800
Subject: mm: Add PAGE_ALIGN_DOWN macro

This is just the same as PAGE_ALIGN(), but rounds the address down, not
up.

Suggested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 include/linux/mm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..bd58ee018555 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -221,6 +221,9 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
 
+/* to align the pointer to the (prev) page boundary */
+#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)
+
 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
 #define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
 
-- 
cgit v1.2.3


From 6077c943beee407168f72ece745b0aeaef6b896f Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:08 -0500
Subject: mm: rename is_pinnable_page() to is_longterm_pinnable_page()

Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory
mapping", v9.

This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.

System stability and performance are not affected according to our ongoing
testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory (aka VRAM)
as SPM (special purpose memory) in the UEFI system address map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages.  The initial user for
this hardware page migration capability is the Frontier supercomputer
project.  This functionality is not AMD-specific.  We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.

Our test nodes in the lab are similar to the Frontier configuration, with
.5 TB of system memory plus 256 GB of device memory split across 4 GPUs,
all in a single coherent address space.  Page migration is expected to
improve application efficiency significantly.  We will report empirical
results as they become available.

Coherent device type pages at gup are now migrated back to system memory
if they are being pinned long-term (FOLL_LONGTERM).  The reason is, that
long-term pinning would interfere with the device memory manager owning
the device-coherent pages (e.g.  evictions in TTM).  These series
incorporate Alistair Popple patches to do this migration from
pin_user_pages() calls.  hmm_gup_test has been added to hmm-test to test
different get user pages calls.

This series includes handling of device-managed anonymous pages returned
by vm_normal_pages.  Although they behave like normal pages for purposes
of mapping in CPU page tables and for COW, they do not support LRU lists,
NUMA migration or THP.

We also introduced a FOLL_LRU flag that adds the same behaviour to
follow_page and related APIs, to allow callers to specify that they expect
to put pages on an LRU list.


This patch (of 14):

is_pinnable_page() and folio_is_pinnable() are renamed to
is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively.
These functions are used in the FOLL_LONGTERM flag context.

Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com
Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 ++++----
 mm/gup.c           | 4 ++--
 mm/gup_test.c      | 2 +-
 mm/hugetlb.c       | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9cc02a7e503b..3c044e38958c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1607,7 +1607,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
 
 /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 #ifdef CONFIG_MIGRATION
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
 #ifdef CONFIG_CMA
 	int mt = get_pageblock_migratetype(page);
@@ -1618,15 +1618,15 @@ static inline bool is_pinnable_page(struct page *page)
 	return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
 }
 #else
-static inline bool is_pinnable_page(struct page *page)
+static inline bool is_longterm_pinnable_page(struct page *page)
 {
 	return true;
 }
 #endif
 
-static inline bool folio_is_pinnable(struct folio *folio)
+static inline bool folio_is_longterm_pinnable(struct folio *folio)
 {
-	return is_pinnable_page(&folio->page);
+	return is_longterm_pinnable_page(&folio->page);
 }
 
 static inline void set_page_zone(struct page *page, enum zone_type zone)
diff --git a/mm/gup.c b/mm/gup.c
index 3129b754ade3..a9940e3b3181 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
 		 * path.
 		 */
 		if (unlikely((flags & FOLL_LONGTERM) &&
-			     !is_pinnable_page(page)))
+			     !is_longterm_pinnable_page(page)))
 			return NULL;
 
 		/*
@@ -1923,7 +1923,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
 			continue;
 		prev_folio = folio;
 
-		if (folio_is_pinnable(folio))
+		if (folio_is_longterm_pinnable(folio))
 			continue;
 
 		/*
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..12b0a91767d3 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
 				dump_page(page, "gup_test failure");
 				break;
 			} else if (cmd == PIN_LONGTERM_BENCHMARK &&
-				WARN(!is_pinnable_page(page),
+				WARN(!is_longterm_pinnable_page(page),
 				     "pages[%lu] is NOT pinnable but pinned\n",
 				     i)) {
 				dump_page(page, "gup_test failure");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 064da8ffbac6..ffdf3fc4a83f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1129,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
 	lockdep_assert_held(&hugetlb_lock);
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-		if (pin && !is_pinnable_page(page))
+		if (pin && !is_longterm_pinnable_page(page))
 			continue;
 
 		if (PageHWPoison(page))
-- 
cgit v1.2.3


From 5bb88dc571b1cbf0284100a317fb21ab7d03e40c Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:09 -0500
Subject: mm: move page zone helpers from mm.h to mmzone.h

It makes more sense to have these helpers in zone specific header
file, rather than the generic mm.h

Link: https://lkml.kernel.org/r/20220715150521.18165-3-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h |  2 +-
 include/linux/mm.h       | 78 ----------------------------------------------
 include/linux/mmzone.h   | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 9f5ee49482de..732dde5988fb 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_MEMREMAP_H_
 #define _LINUX_MEMREMAP_H_
 
-#include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/range.h>
 #include <linux/ioport.h>
 #include <linux/percpu-refcount.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3c044e38958c..a2d01e49253b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1045,84 +1045,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  *   back into memory.
  */
 
-/*
- * The zone field is never updated after free_area_init_core()
- * sets it, so none of the operations on it need to be atomic.
- */
-
-/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
-#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
-#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
-#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
-#define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
-#define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
-
-/*
- * Define the bit shifts to access each section.  For non-existent
- * sections we define the shift as 0; that plus a 0 mask ensures
- * the compiler will optimise away reference to them.
- */
-#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
-#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
-#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
-#define LAST_CPUPID_PGSHIFT	(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
-#define KASAN_TAG_PGSHIFT	(KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
-
-/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF)? \
-						SECTIONS_PGOFF : ZONES_PGOFF)
-#else
-#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
-#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF)? \
-						NODES_PGOFF : ZONES_PGOFF)
-#endif
-
-#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))
-
-#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
-#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
-#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
-#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_SHIFT) - 1)
-#define KASAN_TAG_MASK		((1UL << KASAN_TAG_WIDTH) - 1)
-#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
-
-static inline enum zone_type page_zonenum(const struct page *page)
-{
-	ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
-	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
-}
-
-static inline enum zone_type folio_zonenum(const struct folio *folio)
-{
-	return page_zonenum(&folio->page);
-}
-
-#ifdef CONFIG_ZONE_DEVICE
-static inline bool is_zone_device_page(const struct page *page)
-{
-	return page_zonenum(page) == ZONE_DEVICE;
-}
-extern void memmap_init_zone_device(struct zone *, unsigned long,
-				    unsigned long, struct dev_pagemap *);
-#else
-static inline bool is_zone_device_page(const struct page *page)
-{
-	return false;
-}
-#endif
-
-static inline bool folio_is_zone_device(const struct folio *folio)
-{
-	return is_zone_device_page(&folio->page);
-}
-
-static inline bool is_zone_movable_page(const struct page *page)
-{
-	return page_zonenum(page) == ZONE_MOVABLE;
-}
-
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 735bf5b37949..5da1135e6755 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone)
 	return zone->spanned_pages == 0;
 }
 
+#ifndef BUILD_VDSO32_64
+/*
+ * The zone field is never updated after free_area_init_core()
+ * sets it, so none of the operations on it need to be atomic.
+ */
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
+#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+#define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
+#define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+
+/*
+ * Define the bit shifts to access each section.  For non-existent
+ * sections we define the shift as 0; that plus a 0 mask ensures
+ * the compiler will optimise away reference to them.
+ */
+#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
+#define LAST_CPUPID_PGSHIFT	(LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
+#define KASAN_TAG_PGSHIFT	(KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
+
+/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF) ? \
+						SECTIONS_PGOFF : ZONES_PGOFF)
+#else
+#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF) ? \
+						NODES_PGOFF : ZONES_PGOFF)
+#endif
+
+#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))
+
+#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
+#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
+#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
+#define LAST_CPUPID_MASK	((1UL << LAST_CPUPID_SHIFT) - 1)
+#define KASAN_TAG_MASK		((1UL << KASAN_TAG_WIDTH) - 1)
+#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)
+
+static inline enum zone_type page_zonenum(const struct page *page)
+{
+	ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
+	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+}
+
+static inline enum zone_type folio_zonenum(const struct folio *folio)
+{
+	return page_zonenum(&folio->page);
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_zone_device_page(const struct page *page)
+{
+	return page_zonenum(page) == ZONE_DEVICE;
+}
+extern void memmap_init_zone_device(struct zone *, unsigned long,
+				    unsigned long, struct dev_pagemap *);
+#else
+static inline bool is_zone_device_page(const struct page *page)
+{
+	return false;
+}
+#endif
+
+static inline bool folio_is_zone_device(const struct folio *folio)
+{
+	return is_zone_device_page(&folio->page);
+}
+
+static inline bool is_zone_movable_page(const struct page *page)
+{
+	return page_zonenum(page) == ZONE_MOVABLE;
+}
+#endif
+
 /*
  * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
  * intersection with the given zone
-- 
cgit v1.2.3


From f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:10 -0500
Subject: mm: add zone device coherent type memory support

Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI or
CXL).  Any page of a process can be migrated to such memory.  However, no
one should be allowed to pin such memory so that it can always be evicted.

[hch@lst.de: rebased ontop of the refcount changes, remove is_dev_private_or_coherent_page]
Link: https://lkml.kernel.org/r/20220715150521.18165-4-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memremap.h | 19 +++++++++++++++++++
 include/linux/mm.h       |  5 ++++-
 mm/memcontrol.c          |  7 ++++---
 mm/memory-failure.c      |  8 ++++++--
 mm/memremap.c            | 10 ++++++++++
 mm/migrate_device.c      | 16 +++++++---------
 mm/rmap.c                |  5 +++--
 7 files changed, 53 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 732dde5988fb..09320b7f706c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/mm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -61,6 +68,7 @@ struct vmem_altmap {
 enum memory_type {
 	/* 0 is reserved to catch uninitialized type fields */
 	MEMORY_DEVICE_PRIVATE = 1,
+	MEMORY_DEVICE_COHERENT,
 	MEMORY_DEVICE_FS_DAX,
 	MEMORY_DEVICE_GENERIC,
 	MEMORY_DEVICE_PCI_P2PDMA,
@@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+	return is_zone_device_page(page) &&
+		page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool folio_is_device_coherent(const struct folio *folio)
+{
+	return is_device_coherent_page(&folio->page);
+}
+
 #ifdef CONFIG_ZONE_DEVICE
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2d01e49253b..64393ed3330a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -28,6 +28,7 @@
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/kasan.h>
+#include <linux/memremap.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1537,7 +1538,9 @@ static inline bool is_longterm_pinnable_page(struct page *page)
 	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
 		return false;
 #endif
-	return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page));
+	return !(is_device_coherent_page(page) ||
+		 is_zone_movable_page(page) ||
+		 is_zero_pfn(page_to_pfn(page)));
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1497affe08c4..b1868784f895 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5716,8 +5716,8 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
- *     (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
  *     For now we such page is charge like a regular page would be as for all
  *     intent and purposes it is just special memory taking the place of a
  *     regular page.
@@ -5755,7 +5755,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		 */
 		if (page_memcg(page) == mc.from) {
 			ret = MC_TARGET_PAGE;
-			if (is_device_private_page(page))
+			if (is_device_private_page(page) ||
+			    is_device_coherent_page(page))
 				ret = MC_TARGET_DEVICE;
 			if (target)
 				target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f7612ccdb299..b7ca5db7e60e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1686,12 +1686,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		goto unlock;
 	}
 
-	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_COHERENT:
 		/*
-		 * TODO: Handle HMM pages which may need coordination
+		 * TODO: Handle device pages which may need coordination
 		 * with device-side memory.
 		 */
 		goto unlock;
+	default:
+		break;
 	}
 
 	/*
diff --git a/mm/memremap.c b/mm/memremap.c
index 8b5c8fd4ea8e..f0955785150f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			return ERR_PTR(-EINVAL);
 		}
 		break;
+	case MEMORY_DEVICE_COHERENT:
+		if (!pgmap->ops->page_free) {
+			WARN(1, "Missing page_free method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->owner) {
+			WARN(1, "Missing owner\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
 	case MEMORY_DEVICE_FS_DAX:
 		if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
 			WARN(1, "File system DAX not supported\n");
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5052093d0262..a4847ad65da3 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -518,7 +518,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
  *     handle_pte_fault()
  *       do_anonymous_page()
  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
  */
 static void migrate_vma_insert_page(struct migrate_vma *migrate,
 				    unsigned long addr,
@@ -594,11 +594,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 						page_to_pfn(page));
 		entry = swp_entry_to_pte(swp_entry);
 	} else {
-		/*
-		 * For now we only support migrating to un-addressable device
-		 * memory.
-		 */
-		if (is_zone_device_page(page)) {
+		if (is_zone_device_page(page) &&
+		    !is_device_coherent_page(page)) {
 			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
 			goto abort;
 		}
@@ -701,10 +698,11 @@ void migrate_vma_pages(struct migrate_vma *migrate)
 
 		mapping = page_mapping(page);
 
-		if (is_device_private_page(newpage)) {
+		if (is_device_private_page(newpage) ||
+		    is_device_coherent_page(newpage)) {
 			/*
-			 * For now only support private anonymous when migrating
-			 * to un-addressable device memory.
+			 * For now only support anonymous memory migrating to
+			 * device private or coherent memory.
 			 */
 			if (mapping) {
 				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
diff --git a/mm/rmap.c b/mm/rmap.c
index 83172ee0ea35..0532fd92ecb3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1953,7 +1953,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		/* Update high watermark before we lower rss */
 		update_hiwater_rss(mm);
 
-		if (folio_is_zone_device(folio)) {
+		if (folio_is_device_private(folio)) {
 			unsigned long pfn = folio_pfn(folio);
 			swp_entry_t entry;
 			pte_t swp_pte;
@@ -2124,7 +2124,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
 					TTU_SYNC)))
 		return;
 
-	if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
+	if (folio_is_zone_device(folio) &&
+	    (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
 		return;
 
 	/*
-- 
cgit v1.2.3


From dd19e6d8ffaa1289d75d7833de97faf1b6b2c8e4 Mon Sep 17 00:00:00 2001
From: Alex Sierra <alex.sierra@amd.com>
Date: Fri, 15 Jul 2022 10:05:12 -0500
Subject: mm: add device coherent vma selection for memory migration

This case is used to migrate pages from device memory, back to system
memory.  Device coherent type memory is cache coherent from device and CPU
point of view.

Link: https://lkml.kernel.org/r/20220715150521.18165-6-alex.sierra@amd.com
Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alistair Poppple <apopple@nvidia.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |  1 +
 mm/migrate_device.c     | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 069a89e847f3..b84908debe5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
 	MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
 	MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+	MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index a4847ad65da3..18bc6483f63a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ again:
 			if (is_writable_device_private_entry(entry))
 				mpfn |= MIGRATE_PFN_WRITE;
 		} else {
-			if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-				goto next;
 			pfn = pte_pfn(pte);
-			if (is_zero_pfn(pfn)) {
+			if (is_zero_pfn(pfn) &&
+			    (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
 				mpfn = MIGRATE_PFN_MIGRATE;
 				migrate->cpages++;
 				goto next;
 			}
 			page = vm_normal_page(migrate->vma, addr, pte);
+			if (page && !is_zone_device_page(page) &&
+			    !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+				goto next;
+			else if (page && is_device_coherent_page(page) &&
+			    (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+			     page->pgmap->owner != migrate->pgmap_owner))
+				goto next;
 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 		}
-- 
cgit v1.2.3


From 8012b866085523758780850087102421dbcce522 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:25 +0800
Subject: dax: introduce holder for dax_device

Patch series "v14 fsdax-rmap + v11 fsdax-reflink", v2.

The patchset fsdax-rmap is aimed to support shared pages tracking for
fsdax.

It moves owner tracking from dax_assocaite_entry() to pmem device driver,
by introducing an interface ->memory_failure() for struct pagemap.  This
interface is called by memory_failure() in mm, and implemented by pmem
device.

Then call holder operations to find the filesystem which the corrupted
data located in, and call filesystem handler to track files or metadata
associated with this page.

Finally we are able to try to fix the corrupted data in filesystem and do
other necessary processing, such as killing processes who are using the
files affected.

The call trace is like this:
memory_failure()
|* fsdax case
|------------
|pgmap->ops->memory_failure()      => pmem_pgmap_memory_failure()
| dax_holder_notify_failure()      =>
|  dax_device->holder_ops->notify_failure() =>
|                                     - xfs_dax_notify_failure()
|  |* xfs_dax_notify_failure()
|  |--------------------------
|  |   xfs_rmap_query_range()
|  |    xfs_dax_failure_fn()
|  |    * corrupted on metadata
|  |       try to recover data, call xfs_force_shutdown()
|  |    * corrupted on file data
|  |       try to recover data, call mf_dax_kill_procs()
|* normal case
|-------------
|mf_generic_kill_procs()


The patchset fsdax-reflink attempts to add CoW support for fsdax, and
takes XFS, which has both reflink and fsdax features, as an example.

One of the key mechanisms needed to be implemented in fsdax is CoW.  Copy
the data from srcmap before we actually write data to the destination
iomap.  And we just copy range in which data won't be changed.

Another mechanism is range comparison.  In page cache case, readpage() is
used to load data on disk to page cache in order to be able to compare
data.  In fsdax case, readpage() does not work.  So, we need another
compare data with direct access support.

With the two mechanisms implemented in fsdax, we are able to make reflink
and fsdax work together in XFS.


This patch (of 14):

To easily track filesystem from a pmem device, we introduce a holder for
dax_device structure, and also its operation.  This holder is used to
remember who is using this dax_device:

 - When it is the backend of a filesystem, the holder will be the
   instance of this filesystem.
 - When this pmem device is one of the targets in a mapped device, the
   holder will be this mapped device.  In this case, the mapped device
   has its own dax_device and it will follow the first rule.  So that we
   can finally track to the filesystem we needed.

The holder and holder_ops will be set when filesystem is being mounted,
or an target device is being activated.

Link: https://lkml.kernel.org/r/20220603053738.1218681-1-ruansy.fnst@fujitsu.com
Link: https://lkml.kernel.org/r/20220603053738.1218681-2-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.wiliams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.c     |  2 +-
 fs/erofs/super.c    | 10 ++++----
 fs/ext2/super.c     |  7 +++---
 fs/ext4/super.c     |  9 +++----
 fs/xfs/xfs_buf.c    |  5 ++--
 include/linux/dax.h | 33 +++++++++++++++++++-------
 7 files changed, 110 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 50a08b2ec247..9b5e2a5eb0ae 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -22,6 +22,8 @@
  * @private: dax driver private data
  * @flags: state and boolean properties
  * @ops: operations for this device
+ * @holder_data: holder of a dax_device: could be filesystem or mapped device
+ * @holder_ops: operations for the inner holder
  */
 struct dax_device {
 	struct inode inode;
@@ -29,6 +31,8 @@ struct dax_device {
 	void *private;
 	unsigned long flags;
 	const struct dax_operations *ops;
+	void *holder_data;
+	const struct dax_holder_operations *holder_ops;
 };
 
 static dev_t dax_devt;
@@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
  * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
  * @bdev: block device to find a dax_device for
  * @start_off: returns the byte offset into the dax_device that @bdev starts
+ * @holder: filesystem or mapped device inside the dax_device
+ * @ops: operations for the inner holder
  */
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+		void *holder, const struct dax_holder_operations *ops)
 {
 	struct dax_device *dax_dev;
 	u64 part_size;
@@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
 	dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
 	if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
 		dax_dev = NULL;
+	else if (holder) {
+		if (!cmpxchg(&dax_dev->holder_data, NULL, holder))
+			dax_dev->holder_ops = ops;
+		else
+			dax_dev = NULL;
+	}
 	dax_read_unlock(id);
 
 	return dax_dev;
 }
 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+
+void fs_put_dax(struct dax_device *dax_dev, void *holder)
+{
+	if (dax_dev && holder &&
+	    cmpxchg(&dax_dev->holder_data, holder, NULL) == holder)
+		dax_dev->holder_ops = NULL;
+	put_dax(dax_dev);
+}
+EXPORT_SYMBOL_GPL(fs_put_dax);
 #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
 
 enum dax_device_flags {
@@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 }
 EXPORT_SYMBOL_GPL(dax_recovery_write);
 
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off,
+			      u64 len, int mf_flags)
+{
+	int rc, id;
+
+	id = dax_read_lock();
+	if (!dax_alive(dax_dev)) {
+		rc = -ENXIO;
+		goto out;
+	}
+
+	if (!dax_dev->holder_ops) {
+		rc = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags);
+out:
+	dax_read_unlock(id);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(dax_holder_notify_failure);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev)
 	if (!dax_dev)
 		return;
 
+	if (dax_dev->holder_data != NULL)
+		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+
 	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
 	synchronize_srcu(&dax_srcu);
+
+	/* clear holder data */
+	dax_dev->holder_ops = NULL;
+	dax_dev->holder_data = NULL;
 }
 EXPORT_SYMBOL_GPL(kill_dax);
 
@@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(put_dax);
 
+/**
+ * dax_holder() - obtain the holder of a dax device
+ * @dax_dev: a dax_device instance
+
+ * Return: the holder's data which represents the holder if registered,
+ * otherwize NULL.
+ */
+void *dax_holder(struct dax_device *dax_dev)
+{
+	return dax_dev->holder_data;
+}
+EXPORT_SYMBOL_GPL(dax_holder);
+
 /**
  * inode_dax: convert a public inode into its dax_dev
  * @inode: An inode with i_cdev pointing to a dax_dev
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2b75f1ef7386..0177a4ce9a18 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
 	}
 
 	td->dm_dev.bdev = bdev;
-	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
+	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
 	return 0;
 }
 
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 95addc5c9d34..3173debeaa5a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 		if (IS_ERR(bdev))
 			return PTR_ERR(bdev);
 		dif->bdev = bdev;
-		dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+		dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+						  NULL, NULL);
 	}
 
 	dif->blocks = le32_to_cpu(dis->blocks);
@@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		}
 
 		sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
-						  &sbi->dax_part_off);
+						  &sbi->dax_part_off,
+						  NULL, NULL);
 	}
 
 	err = erofs_read_superblock(sb);
@@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 {
 	struct erofs_device_info *dif = ptr;
 
-	fs_put_dax(dif->dax_dev);
+	fs_put_dax(dif->dax_dev, NULL);
 	if (dif->bdev)
 		blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
 	erofs_fscache_unregister_cookie(&dif->fscache);
@@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb)
 		return;
 
 	erofs_free_dev_context(sbi->devs);
-	fs_put_dax(sbi->dax_dev);
+	fs_put_dax(sbi->dax_dev, NULL);
 	erofs_fscache_unregister_cookie(&sbi->s_fscache);
 	erofs_fscache_unregister_fs(sb);
 	kfree(sbi->opt.fsid);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..4638946251b9 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 }
 
@@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
-	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+					   NULL, NULL);
 
 	spin_lock_init(&sbi->s_lock);
 	ret = -EINVAL;
@@ -1204,7 +1205,7 @@ failed_mount_group_desc:
 failed_mount:
 	brelse(bh);
 failed_sbi:
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 845f2f8aee5f..1f8bf507ba5a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb)
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
 #if IS_ENABLED(CONFIG_UNICODE)
 	utf8_unload(sb->s_encoding);
@@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
 		return;
 
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 }
 
@@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
 	if (!sbi)
 		return NULL;
 
-	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+					   NULL, NULL);
 
 	sbi->s_blockgroup_lock =
 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
 	sbi->s_sb = sb;
 	return sbi;
 err_out:
-	fs_put_dax(sbi->s_daxdev);
+	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 	return NULL;
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4aa9c9cf5b6e..1ec2a7b6d44e 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1911,7 +1911,7 @@ xfs_free_buftarg(
 	list_lru_destroy(&btp->bt_lru);
 
 	blkdev_issue_flush(btp->bt_bdev);
-	fs_put_dax(btp->bt_daxdev);
+	fs_put_dax(btp->bt_daxdev, NULL);
 
 	kmem_free(btp);
 }
@@ -1964,7 +1964,8 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
-	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL,
+					    NULL);
 
 	/*
 	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e7b81634c52a..cf85fc36da5f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -43,8 +43,21 @@ struct dax_operations {
 			void *addr, size_t bytes, struct iov_iter *iter);
 };
 
+struct dax_holder_operations {
+	/*
+	 * notify_failure - notify memory failure into inner holder device
+	 * @dax_dev: the dax device which contains the holder
+	 * @offset: offset on this dax device where memory failure occurs
+	 * @len: length of this memory failure event
+	 * @flags: action flags for memory failure handler
+	 */
+	int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
+			u64 len, int mf_flags);
+};
+
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+void *dax_holder(struct dax_device *dax_dev);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
 	return dax_synchronous(dax_dev);
 }
 #else
+static inline void *dax_holder(struct dax_device *dax_dev)
+{
+	return NULL;
+}
 static inline struct dax_device *alloc_dax(void *private,
 		const struct dax_operations *ops)
 {
@@ -114,12 +131,9 @@ struct writeback_control;
 #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
 int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
 void dax_remove_host(struct gendisk *disk);
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
-		u64 *start_off);
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
-	put_dax(dax_dev);
-}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
+		void *holder, const struct dax_holder_operations *ops);
+void fs_put_dax(struct dax_device *dax_dev, void *holder);
 #else
 static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
 {
@@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk)
 {
 }
 static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
-		u64 *start_off)
+		u64 *start_off, void *holder,
+		const struct dax_holder_operations *ops)
 {
 	return NULL;
 }
-static inline void fs_put_dax(struct dax_device *dax_dev)
+static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
 {
 }
 #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
@@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 			size_t nr_pages);
+int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
+		int mf_flags);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-- 
cgit v1.2.3


From 33a8f7f2b3a3437d016d1b4047a4fd37eb6951b3 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:27 +0800
Subject: pagemap,pmem: introduce ->memory_failure()

When memory-failure occurs, we call this function which is implemented by
each kind of devices.  For the fsdax case, pmem device driver implements
it.  Pmem device driver will find out the filesystem in which the
corrupted page located in.

With dax_holder notify support, we are able to notify the memory failure
from pmem driver to upper layers.  If there is something not support in
the notify routine, memory_failure will fall back to the generic hanlder.

Link: https://lkml.kernel.org/r/20220603053738.1218681-4-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/nvdimm/pmem.c    | 17 +++++++++++++++++
 include/linux/memremap.h | 12 ++++++++++++
 mm/memory-failure.c      | 14 ++++++++++++++
 3 files changed, 43 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 629d10fcf53b..107c9cb3d57d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem)
 	blk_cleanup_disk(pmem->disk);
 }
 
+static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
+		unsigned long pfn, unsigned long nr_pages, int mf_flags)
+{
+	struct pmem_device *pmem =
+			container_of(pgmap, struct pmem_device, pgmap);
+	u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset;
+	u64 len = nr_pages << PAGE_SHIFT;
+
+	return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags);
+}
+
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+	.memory_failure		= pmem_pagemap_memory_failure,
+};
+
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
 {
@@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->pgmap.range.end = res->end;
 		pmem->pgmap.nr_range = 1;
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
 		bb_range = pmem->pgmap.range;
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 09320b7f706c..19010491a603 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -87,6 +87,18 @@ struct dev_pagemap_ops {
 	 * the page back to a CPU accessible page.
 	 */
 	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+
+	/*
+	 * Handle the memory failure happens on a range of pfns.  Notify the
+	 * processes who are using these pfns, and try to recover the data on
+	 * them if necessary.  The mf_flags is finally passed to the recover
+	 * function through the whole notify routine.
+	 *
+	 * When this is not implemented, or it returns -EOPNOTSUPP, the caller
+	 * will fall back to a common handler called mf_generic_kill_procs().
+	 */
+	int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
+			      unsigned long nr_pages, int mf_flags);
 };
 
 #define PGMAP_ALTMAP_VALID	(1 << 0)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f8a8a5d45eba..46c77151f726 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1748,6 +1748,20 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	if (!pgmap_pfn_valid(pgmap, pfn))
 		goto out;
 
+	/*
+	 * Call driver's implementation to handle the memory failure, otherwise
+	 * fall back to generic handler.
+	 */
+	if (pgmap->ops->memory_failure) {
+		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
+		/*
+		 * Fall back to generic handler too if operation is not
+		 * supported inside the driver/device/filesystem.
+		 */
+		if (rc != -EOPNOTSUPP)
+			goto out;
+	}
+
 	rc = mf_generic_kill_procs(pfn, flags, pgmap);
 out:
 	/* drop pgmap ref acquired in caller */
-- 
cgit v1.2.3


From 2f437effc689ef913fbe5e31110580b4e7cf04be Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:28 +0800
Subject: fsdax: introduce dax_lock_mapping_entry()

The current dax_lock_page() locks dax entry by obtaining mapping and index
in page.  To support 1-to-N RMAP in NVDIMM, we need a new function to lock
a specific dax entry corresponding to this file's mapping,index.  And
output the page corresponding to the specific dax entry for caller use.

Link: https://lkml.kernel.org/r/20220603053738.1218681-5-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c            | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h | 15 +++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 4155a6107fa1..65e44d78b3bb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -455,6 +455,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
 	dax_unlock_entry(&xas, (void *)cookie);
 }
 
+/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+		struct page **page)
+{
+	XA_STATE(xas, NULL, 0);
+	void *entry;
+
+	rcu_read_lock();
+	for (;;) {
+		entry = NULL;
+		if (!dax_mapping(mapping))
+			break;
+
+		xas.xa = &mapping->i_pages;
+		xas_lock_irq(&xas);
+		xas_set(&xas, index);
+		entry = xas_load(&xas);
+		if (dax_is_locked(entry)) {
+			rcu_read_unlock();
+			wait_entry_unlocked(&xas, entry);
+			rcu_read_lock();
+			continue;
+		}
+		if (!entry ||
+		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+			/*
+			 * Because we are looking for entry from file's mapping
+			 * and index, so the entry may not be inserted for now,
+			 * or even a zero/empty entry.  We don't think this is
+			 * an error case.  So, return a special value and do
+			 * not output @page.
+			 */
+			entry = (void *)~0UL;
+		} else {
+			*page = pfn_to_page(dax_to_pfn(entry));
+			dax_lock_entry(&xas, entry);
+		}
+		xas_unlock_irq(&xas);
+		break;
+	}
+	rcu_read_unlock();
+	return (dax_entry_t)entry;
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+		dax_entry_t cookie)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+
+	if (cookie == ~0UL)
+		return;
+
+	dax_unlock_entry(&xas, (void *)cookie);
+}
+
 /*
  * Find page cache entry at given index. If it is a DAX entry, return it
  * with the entry locked. If the page cache doesn't contain an entry at
diff --git a/include/linux/dax.h b/include/linux/dax.h
index cf85fc36da5f..7116681b48c0 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -161,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping);
 struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
+		unsigned long index, struct page **page);
+void dax_unlock_mapping_entry(struct address_space *mapping,
+		unsigned long index, dax_entry_t cookie);
 #else
 static inline struct page *dax_layout_busy_page(struct address_space *mapping)
 {
@@ -188,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page)
 static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
 {
 }
+
+static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
+		unsigned long index, struct page **page)
+{
+	return 0;
+}
+
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+		unsigned long index, dax_entry_t cookie)
+{
+}
 #endif
 
 int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-- 
cgit v1.2.3


From c36e2024957120566efd99395b5c8cc95b5175c1 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:29 +0800
Subject: mm: introduce mf_dax_kill_procs() for fsdax case

This new function is a variant of mf_generic_kill_procs that accepts a
file, offset pair instead of a struct to support multiple files sharing a
DAX mapping.  It is intended to be called by the file systems as part of
the memory_failure handler after the file system performed a reverse
mapping from the storage address to the file and file offset.

Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h  |  2 ++
 mm/memory-failure.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 88 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64393ed3330a..d4ebfc206e2b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3178,6 +3178,8 @@ enum mf_flags {
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
 };
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+		      unsigned long count, int mf_flags);
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46c77151f726..c9931c676335 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -297,10 +297,9 @@ void shake_page(struct page *p)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+		unsigned long address)
 {
-	unsigned long address = vma_address(page, vma);
 	unsigned long ret = 0;
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -340,10 +339,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
 /*
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ *   In other cases, such as anonymous and file-backend page, the address to be
+ *   killed can be caculated by @p itself.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-		       struct vm_area_struct *vma,
-		       struct list_head *to_kill)
+			pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+			struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
@@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	}
 
 	tk->addr = page_address_in_vma(p, vma);
-	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-	else
+	if (is_zone_device_page(p)) {
+		/*
+		 * Since page->mapping is not used for fsdax, we need
+		 * calculate the address based on the vma.
+		 */
+		if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+	} else
 		tk->size_shift = page_shift(compound_head(p));
 
 	/*
@@ -505,7 +514,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -541,13 +550,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * to be informed of all such data corruptions.
 			 */
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, 0, vma, to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
 	i_mmap_unlock_read(mapping);
 }
 
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+		struct address_space *mapping, pgoff_t pgoff,
+		struct list_head *to_kill)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	i_mmap_lock_read(mapping);
+	read_lock(&tasklist_lock);
+	for_each_process(tsk) {
+		struct task_struct *t = task_early_kill(tsk, true);
+
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, pgoff, vma, to_kill);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
@@ -1588,6 +1625,45 @@ unlock:
 	return rc;
 }
 
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping:	address_space of the file in use
+ * @index:	start pgoff of the range within the file
+ * @count:	length of the range, in unit of PAGE_SIZE
+ * @mf_flags:	memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+		unsigned long count, int mf_flags)
+{
+	LIST_HEAD(to_kill);
+	dax_entry_t cookie;
+	struct page *page;
+	size_t end = index + count;
+
+	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+	for (; index < end; index++) {
+		page = NULL;
+		cookie = dax_lock_mapping_entry(mapping, index, &page);
+		if (!cookie)
+			return -EBUSY;
+		if (!page)
+			goto unlock;
+
+		SetPageHWPoison(page);
+
+		collect_procs_fsdax(page, mapping, index, &to_kill);
+		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+				index, mf_flags);
+unlock:
+		dax_unlock_mapping_entry(mapping, index, cookie);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
 /*
  * Called from hugetlb code with hugetlb_lock held.
  *
-- 
cgit v1.2.3


From 6061b69b9a550a2ab84e805d0d2315ba6215f112 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:31 +0800
Subject: fsdax: set a CoW flag when associate reflink mappings

Introduce a PAGE_MAPPING_DAX_COW flag to support association with CoW file
mappings.  In this case, since the dax-rmap has already took the
responsibility to look up for shared files by given dax page, the
page->mapping is no longer to used for rmap but for marking that this dax
page is shared.  And to make sure disassociation works fine, we use
page->index as refcount, and clear page->mapping to the initial state when
page->index is decreased to 0.

With the help of this new flag, it is able to distinguish normal case and
CoW case, and keep the warning in normal case.

Link: https://lkml.kernel.org/r/20220603053738.1218681-8-ruansy.fnst@fujitsu.com
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c                   | 50 +++++++++++++++++++++++++++++++++++++---------
 include/linux/page-flags.h |  6 ++++++
 2 files changed, 47 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 65e44d78b3bb..b59b864017ad 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
 	for (pfn = dax_to_pfn(entry); \
 			pfn < dax_end_pfn(entry); pfn++)
 
+static inline bool dax_mapping_is_cow(struct address_space *mapping)
+{
+	return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+}
+
 /*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
+ * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ */
+static inline void dax_mapping_set_cow(struct page *page)
+{
+	if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+		/*
+		 * Reset the index if the page was already mapped
+		 * regularly before.
+		 */
+		if (page->mapping)
+			page->index = 1;
+		page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+	}
+	page->index++;
+}
+
+/*
+ * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * whether this entry is shared by multiple files.  If so, set the page->mapping
+ * FS_DAX_MAPPING_COW, and use page->index as refcount.
  */
 static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address)
+		struct vm_area_struct *vma, unsigned long address, bool cow)
 {
 	unsigned long size = dax_entry_size(entry), pfn, index;
 	int i = 0;
@@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		WARN_ON_ONCE(page->mapping);
-		page->mapping = mapping;
-		page->index = index + i++;
+		if (cow) {
+			dax_mapping_set_cow(page);
+		} else {
+			WARN_ON_ONCE(page->mapping);
+			page->mapping = mapping;
+			page->index = index + i++;
+		}
 	}
 }
 
@@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		struct page *page = pfn_to_page(pfn);
 
 		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		if (dax_mapping_is_cow(page->mapping)) {
+			/* keep the CoW flag if this page is still shared */
+			if (page->index-- > 0)
+				continue;
+		} else
+			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
 		page->mapping = NULL;
 		page->index = 0;
 	}
@@ -830,7 +861,8 @@ static void *dax_insert_entry(struct xa_state *xas,
 		void *old;
 
 		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
+		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
+				false);
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 82719d33c0f1..f2ff65f1bf83 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -661,6 +661,12 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 
+/*
+ * Different with flags above, this flag is used only for fsdax mode.  It
+ * indicates that this page->mapping is now under reflink case.
+ */
+#define PAGE_MAPPING_DAX_COW	0x1
+
 static __always_inline bool folio_mapping_flags(struct folio *folio)
 {
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
-- 
cgit v1.2.3


From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001
From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date: Fri, 3 Jun 2022 13:37:36 +0800
Subject: fsdax: dedup file range to use a compare function

With dax we cannot deal with readpage() etc.  So, we create a dax
comparison function which is similar with vfs_dedupe_file_range_compare().
And introduce dax_remap_file_range_prep() for filesystem use.

Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.wiliams@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Goldwyn Rodrigues <rgoldwyn@suse.de>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Ritesh Harjani <riteshh@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c             | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/remap_range.c     | 31 ++++++++++++++++----
 fs/xfs/xfs_reflink.c |  8 +++--
 include/linux/dax.h  |  8 +++++
 include/linux/fs.h   | 12 +++++---
 5 files changed, 130 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 0aab32300531..e0f9c4a0a0c1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1873,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 	return dax_insert_pfn_mkwrite(vmf, pfn, order);
 }
 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+		struct iomap_iter *it_dest, u64 len, bool *same)
+{
+	const struct iomap *smap = &it_src->iomap;
+	const struct iomap *dmap = &it_dest->iomap;
+	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+	void *saddr, *daddr;
+	int id, ret;
+
+	len = min(len, min(smap->length, dmap->length));
+
+	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+		*same = true;
+		return len;
+	}
+
+	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+		*same = false;
+		return 0;
+	}
+
+	id = dax_read_lock();
+	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+				      &saddr, NULL);
+	if (ret < 0)
+		goto out_unlock;
+
+	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+				      &daddr, NULL);
+	if (ret < 0)
+		goto out_unlock;
+
+	*same = !memcmp(saddr, daddr, len);
+	if (!*same)
+		len = 0;
+	dax_read_unlock(id);
+	return len;
+
+out_unlock:
+	dax_read_unlock(id);
+	return -EIO;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
+		const struct iomap_ops *ops)
+{
+	struct iomap_iter src_iter = {
+		.inode		= src,
+		.pos		= srcoff,
+		.len		= len,
+		.flags		= IOMAP_DAX,
+	};
+	struct iomap_iter dst_iter = {
+		.inode		= dst,
+		.pos		= dstoff,
+		.len		= len,
+		.flags		= IOMAP_DAX,
+	};
+	int ret;
+
+	while ((ret = iomap_iter(&src_iter, ops)) > 0) {
+		while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
+			dst_iter.processed = dax_range_compare_iter(&src_iter,
+						&dst_iter, len, same);
+		}
+		if (ret <= 0)
+			src_iter.processed = ret;
+	}
+	return ret;
+}
+
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      loff_t *len, unsigned int remap_flags,
+			      const struct iomap_ops *ops)
+{
+	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+					       pos_out, len, remap_flags, ops);
+}
+EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e112b5424cdb..231de627c1b9 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -14,6 +14,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/dax.h>
 #include "internal.h"
 
 #include <linux/uaccess.h>
@@ -271,9 +272,11 @@ out_error:
  * If there's an error, then the usual negative error code is returned.
  * Otherwise returns 0 with *len set to the request length.
  */
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
-				  struct file *file_out, loff_t pos_out,
-				  loff_t *len, unsigned int remap_flags)
+int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out,
+				loff_t *len, unsigned int remap_flags,
+				const struct iomap_ops *dax_read_ops)
 {
 	struct inode *inode_in = file_inode(file_in);
 	struct inode *inode_out = file_inode(file_out);
@@ -333,8 +336,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 	if (remap_flags & REMAP_FILE_DEDUP) {
 		bool		is_same = false;
 
-		ret = vfs_dedupe_file_range_compare(file_in, pos_in,
-				file_out, pos_out, *len, &is_same);
+		if (*len == 0)
+			return 0;
+
+		if (!IS_DAX(inode_in))
+			ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+					file_out, pos_out, *len, &is_same);
+		else if (dax_read_ops)
+			ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+					inode_out, pos_out, *len, &is_same,
+					dax_read_ops);
+		else
+			return -EINVAL;
 		if (ret)
 			return ret;
 		if (!is_same)
@@ -352,6 +365,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 
 	return ret;
 }
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  loff_t *len, unsigned int remap_flags)
+{
+	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+					       pos_out, len, remap_flags, NULL);
+}
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e7a7c00d93be..cbaf36d21020 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1367,8 +1367,12 @@ xfs_reflink_remap_prep(
 	if (IS_DAX(inode_in) || IS_DAX(inode_out))
 		goto out_unlock;
 
-	ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
-			len, remap_flags);
+	if (!IS_DAX(inode_in))
+		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+				pos_out, len, remap_flags);
+	else
+		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+				pos_out, len, remap_flags, &xfs_read_iomap_ops);
 	if (ret || *len == 0)
 		goto out_unlock;
 
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 7116681b48c0..ba985333e26b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+				  struct inode *dest, loff_t destoff,
+				  loff_t len, bool *is_same,
+				  const struct iomap_ops *ops);
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      loff_t *len, unsigned int remap_flags,
+			      const struct iomap_ops *ops);
 static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..134e9d7ad5d6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,6 +74,7 @@ struct fsverity_operations;
 struct fs_context;
 struct fs_parameter_spec;
 struct fileattr;
+struct iomap_ops;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
 				       struct file *file_out, loff_t pos_out,
 				       size_t len, unsigned int flags);
-extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
-					 struct file *file_out, loff_t pos_out,
-					 loff_t *count,
-					 unsigned int remap_flags);
+int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				    struct file *file_out, loff_t pos_out,
+				    loff_t *len, unsigned int remap_flags,
+				    const struct iomap_ops *dax_read_ops);
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				  struct file *file_out, loff_t pos_out,
+				  loff_t *count, unsigned int remap_flags);
 extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
 				  struct file *file_out, loff_t pos_out,
 				  loff_t len, unsigned int remap_flags);
-- 
cgit v1.2.3


From 4fa6893faeaaea4fe4440512d2a708527ef47051 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:35 -0700
Subject: mm: thp: consolidate vma size check to transhuge_vma_suitable

There are couple of places that check whether the vma size is ok for THP
or whether address fits, they are open coded and duplicate, use
transhuge_vma_suitable() to do the job by passing in (vma->end -
HPAGE_PMD_SIZE).

Move vma size check into hugepage_vma_check().  This will make
khugepaged_enter() is as same as khugepaged_enter_vma().  There is just
one caller for khugepaged_enter(), replace it to khugepaged_enter_vma()
and remove khugepaged_enter().

Link: https://lkml.kernel.org/r/20220616174840.1202070-3-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    | 11 +++++++++++
 include/linux/khugepaged.h | 14 --------------
 mm/huge_memory.c           |  2 +-
 mm/khugepaged.c            | 19 ++++++-------------
 4 files changed, 18 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 648cb3ce7099..8a5a8bfce0f5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -116,6 +116,17 @@ extern struct kobj_attribute shmem_enabled_attr;
 
 extern unsigned long transparent_hugepage_flags;
 
+/*
+ * Do the below checks:
+ *   - For file vma, check if the linear page offset of vma is
+ *     HPAGE_PMD_NR aligned within the file.  The hugepage is
+ *     guaranteed to be hugepage-aligned within the file, but we must
+ *     check that the PMD-aligned addresses in the VMA map to
+ *     PMD-aligned offsets within the file, else the hugepage will
+ *     not be PMD-mappable.
+ *   - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE
+ *     area.
+ */
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long addr)
 {
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 392d34c3c59a..31ca8a7f78f4 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -51,16 +51,6 @@ static inline void khugepaged_exit(struct mm_struct *mm)
 	if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
 		__khugepaged_exit(mm);
 }
-
-static inline void khugepaged_enter(struct vm_area_struct *vma,
-				   unsigned long vm_flags)
-{
-	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
-	    khugepaged_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags))
-			__khugepaged_enter(vma->vm_mm);
-	}
-}
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
@@ -68,10 +58,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
 static inline void khugepaged_exit(struct mm_struct *mm)
 {
 }
-static inline void khugepaged_enter(struct vm_area_struct *vma,
-				    unsigned long vm_flags)
-{
-}
 static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
 					unsigned long vm_flags)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a563de8234c1..2751649aaf33 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -726,7 +726,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
-	khugepaged_enter(vma, vma->vm_flags);
+	khugepaged_enter_vma(vma, vma->vm_flags);
 
 	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm) &&
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3eec970a884d..c7e22135f1b5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -443,8 +443,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	if (vma_is_dax(vma))
 		return false;
 
-	if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
-				vma->vm_pgoff, HPAGE_PMD_NR))
+	/* Check alignment for file vma and size for both file and anon vma */
+	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
 		return false;
 
 	/* Enabled via shmem mount options or sysfs settings. */
@@ -505,9 +505,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  unsigned long vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
-	    khugepaged_enabled() &&
-	    (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
-	     (vma->vm_end & HPAGE_PMD_MASK))) {
+	    khugepaged_enabled()) {
 		if (hugepage_vma_check(vma, vm_flags))
 			__khugepaged_enter(vma->vm_mm);
 	}
@@ -948,7 +946,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 		struct vm_area_struct **vmap)
 {
 	struct vm_area_struct *vma;
-	unsigned long hstart, hend;
 
 	if (unlikely(khugepaged_test_exit(mm)))
 		return SCAN_ANY_PROCESS;
@@ -957,9 +954,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	if (!vma)
 		return SCAN_VMA_NULL;
 
-	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
-	hend = vma->vm_end & HPAGE_PMD_MASK;
-	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+	if (!transhuge_vma_suitable(vma, address))
 		return SCAN_ADDRESS_RANGE;
 	if (!hugepage_vma_check(vma, vma->vm_flags))
 		return SCAN_VMA_CHECK;
@@ -2135,10 +2130,8 @@ skip:
 			progress++;
 			continue;
 		}
-		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
-		hend = vma->vm_end & HPAGE_PMD_MASK;
-		if (hstart >= hend)
-			goto skip;
+		hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
+		hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
 		if (khugepaged_scan.address > hend)
 			goto skip;
 		if (khugepaged_scan.address < hstart)
-- 
cgit v1.2.3


From 9fec51689ff60d9766b38051a0b1692f93d95364 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:37 -0700
Subject: mm: thp: kill transparent_hugepage_active()

The transparent_hugepage_active() was introduced to show THP eligibility
bit in smaps in proc, smaps is the only user.  But it actually does the
similar check as hugepage_vma_check() which is used by khugepaged.  We
definitely don't have to maintain two similar checks, so kill
transparent_hugepage_active().

This patch also fixed the wrong behavior for VM_NO_KHUGEPAGED vmas.

Also move hugepage_vma_check() to huge_memory.c and huge_mm.h since it
is not only for khugepaged anymore.

[akpm@linux-foundation.org: check vma->vm_mm, per Zach]
[akpm@linux-foundation.org: add comment to vdso check]
Link: https://lkml.kernel.org/r/20220616174840.1202070-5-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c         |  2 +-
 include/linux/huge_mm.h    | 16 ++++++++------
 include/linux/khugepaged.h |  2 --
 mm/huge_memory.c           | 53 ++++++++++++++++++++++++++++++++++++++--------
 mm/khugepaged.c            | 48 ++++-------------------------------------
 5 files changed, 59 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 1d7fd832123b..072cf770b5d0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %d\n",
-		   transparent_hugepage_active(vma));
+		   hugepage_vma_check(vma, vma->vm_flags, true));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8a5a8bfce0f5..64487bcd0c7b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -202,7 +202,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-bool transparent_hugepage_active(struct vm_area_struct *vma);
+bool hugepage_vma_check(struct vm_area_struct *vma,
+			unsigned long vm_flags,
+			bool smaps);
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -351,11 +353,6 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
 	return false;
 }
 
-static inline bool transparent_hugepage_active(struct vm_area_struct *vma)
-{
-	return false;
-}
-
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long addr)
 {
@@ -368,6 +365,13 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
 	return false;
 }
 
+static inline bool hugepage_vma_check(struct vm_area_struct *vma,
+				       unsigned long vm_flags,
+				       bool smaps)
+{
+	return false;
+}
+
 static inline void prep_transhuge_page(struct page *page) {}
 
 #define transparent_hugepage_flags 0UL
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 31ca8a7f78f4..ea5fd4c398f7 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group;
 extern int khugepaged_init(void);
 extern void khugepaged_destroy(void);
 extern int start_stop_khugepaged(void);
-extern bool hugepage_vma_check(struct vm_area_struct *vma,
-			       unsigned long vm_flags);
 extern void __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern void khugepaged_enter_vma(struct vm_area_struct *vma,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2751649aaf33..8cbd21aaf03e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -69,21 +69,56 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_active(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+			unsigned long vm_flags,
+			bool smaps)
 {
-	/* The addr is used to check if the vma size fits */
-	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+	if (!vma->vm_mm)		/* vdso */
+		return false;
+
+	if (!transhuge_vma_enabled(vma, vm_flags))
+		return false;
 
-	if (!transhuge_vma_suitable(vma, addr))
+	if (vm_flags & VM_NO_KHUGEPAGED)
 		return false;
-	if (vma_is_anonymous(vma))
-		return __transparent_hugepage_enabled(vma);
-	if (vma_is_shmem(vma))
+
+	/* Don't run khugepaged against DAX vma */
+	if (vma_is_dax(vma))
+		return false;
+
+	/* Check alignment for file vma and size for both file and anon vma */
+	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+		return false;
+
+	/* Enabled via shmem mount options or sysfs settings. */
+	if (shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
-	if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+
+	if (!khugepaged_enabled())
+		return false;
+
+	/* THP settings require madvise. */
+	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+		return false;
+
+	/* Only regular file is valid */
+	if (file_thp_enabled(vma))
 		return true;
 
-	return false;
+	if (!vma_is_anonymous(vma))
+		return false;
+
+	if (vma_is_temporary_stack(vma))
+		return false;
+
+	/*
+	 * THPeligible bit of smaps should show 1 for proper VMAs even
+	 * though anon_vma is not initialized yet.
+	 */
+	if (!vma->anon_vma)
+		return smaps;
+
+	return true;
 }
 
 static bool get_huge_zero_page(void)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 67e144e64b7f..6bbf3adac534 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -430,46 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
-bool hugepage_vma_check(struct vm_area_struct *vma,
-			unsigned long vm_flags)
-{
-	if (!transhuge_vma_enabled(vma, vm_flags))
-		return false;
-
-	if (vm_flags & VM_NO_KHUGEPAGED)
-		return false;
-
-	/* Don't run khugepaged against DAX vma */
-	if (vma_is_dax(vma))
-		return false;
-
-	/* Check alignment for file vma and size for both file and anon vma */
-	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
-		return false;
-
-	/* Enabled via shmem mount options or sysfs settings. */
-	if (shmem_file(vma->vm_file))
-		return shmem_huge_enabled(vma);
-
-	if (!khugepaged_enabled())
-		return false;
-
-	/* THP settings require madvise. */
-	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
-		return false;
-
-	/* Only regular file is valid */
-	if (file_thp_enabled(vma))
-		return true;
-
-	if (!vma->anon_vma || !vma_is_anonymous(vma))
-		return false;
-	if (vma_is_temporary_stack(vma))
-		return false;
-
-	return true;
-}
-
 void __khugepaged_enter(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
@@ -506,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    khugepaged_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags))
+		if (hugepage_vma_check(vma, vm_flags, false))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -956,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
 	if (!transhuge_vma_suitable(vma, address))
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags))
+	if (!hugepage_vma_check(vma, vma->vm_flags, false))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -1441,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
 	 * will not fail the vma for missing VM_HUGEPAGE
 	 */
-	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false))
 		return;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2131,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			progress++;
 			break;
 		}
-		if (!hugepage_vma_check(vma, vma->vm_flags)) {
+		if (!hugepage_vma_check(vma, vma->vm_flags, false)) {
 skip:
 			progress++;
 			continue;
-- 
cgit v1.2.3


From 7da4e2cb8b1ff8221759bfc7512d651ee69516dc Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:38 -0700
Subject: mm: thp: kill __transhuge_page_enabled()

The page fault path checks THP eligibility with __transhuge_page_enabled()
which does the similar thing as hugepage_vma_check(), so use
hugepage_vma_check() instead.

However page fault allows DAX and !anon_vma cases, so added a new flag,
in_pf, to hugepage_vma_check() to make page fault work correctly.

The in_pf flag is also used to skip shmem and file THP for page fault
since shmem handles THP in its own shmem_fault() and file THP allocation
on fault is not supported yet.

Also remove hugepage_vma_enabled() since hugepage_vma_check() is the only
caller now, it is not necessary to have a helper function.

Link: https://lkml.kernel.org/r/20220616174840.1202070-6-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      |  2 +-
 include/linux/huge_mm.h | 57 ++-----------------------------------------------
 mm/huge_memory.c        | 51 +++++++++++++++++++++++++++++++++----------
 mm/khugepaged.c         |  8 +++----
 mm/memory.c             |  7 ++++--
 5 files changed, 52 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 072cf770b5d0..a3398d0f1927 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %d\n",
-		   hugepage_vma_check(vma, vma->vm_flags, true));
+		   hugepage_vma_check(vma, vma->vm_flags, true, false));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 64487bcd0c7b..cd8a6c5d9fe5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -146,48 +146,6 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 	return true;
 }
 
-static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
-					  unsigned long vm_flags)
-{
-	/* Explicitly disabled through madvise. */
-	if ((vm_flags & VM_NOHUGEPAGE) ||
-	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
-		return false;
-	return true;
-}
-
-/*
- * to be used on vmas which are known to support THP.
- * Use transparent_hugepage_active otherwise
- */
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
-
-	/*
-	 * If the hardware/firmware marked hugepage support disabled.
-	 */
-	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
-		return false;
-
-	if (!transhuge_vma_enabled(vma, vma->vm_flags))
-		return false;
-
-	if (vma_is_temporary_stack(vma))
-		return false;
-
-	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
-		return true;
-
-	if (vma_is_dax(vma))
-		return true;
-
-	if (transparent_hugepage_flags &
-				(1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
-		return !!(vma->vm_flags & VM_HUGEPAGE);
-
-	return false;
-}
-
 static inline bool file_thp_enabled(struct vm_area_struct *vma)
 {
 	struct inode *inode;
@@ -204,7 +162,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 
 bool hugepage_vma_check(struct vm_area_struct *vma,
 			unsigned long vm_flags,
-			bool smaps);
+			bool smaps, bool in_pf);
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -348,26 +306,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 	return false;
 }
 
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
-	return false;
-}
-
 static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 		unsigned long addr)
 {
 	return false;
 }
 
-static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
-					  unsigned long vm_flags)
-{
-	return false;
-}
-
 static inline bool hugepage_vma_check(struct vm_area_struct *vma,
 				       unsigned long vm_flags,
-				       bool smaps)
+				       bool smaps, bool in_pf)
 {
 	return false;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8cbd21aaf03e..4b90c7021e52 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -71,27 +71,53 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
 bool hugepage_vma_check(struct vm_area_struct *vma,
 			unsigned long vm_flags,
-			bool smaps)
+			bool smaps, bool in_pf)
 {
 	if (!vma->vm_mm)		/* vdso */
 		return false;
 
-	if (!transhuge_vma_enabled(vma, vm_flags))
+	/*
+	 * Explicitly disabled through madvise or prctl, or some
+	 * architectures may disable THP for some mappings, for
+	 * example, s390 kvm.
+	 * */
+	if ((vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
-
-	if (vm_flags & VM_NO_KHUGEPAGED)
+	/*
+	 * If the hardware/firmware marked hugepage support disabled.
+	 */
+	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
 		return false;
 
-	/* Don't run khugepaged against DAX vma */
+	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
 	if (vma_is_dax(vma))
+		return in_pf;
+
+	/*
+	 * Special VMA and hugetlb VMA.
+	 * Must be checked after dax since some dax mappings may have
+	 * VM_MIXEDMAP set.
+	 */
+	if (vm_flags & VM_NO_KHUGEPAGED)
 		return false;
 
-	/* Check alignment for file vma and size for both file and anon vma */
-	if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+	/*
+	 * Check alignment for file vma and size for both file and anon vma.
+	 *
+	 * Skip the check for page fault. Huge fault does the check in fault
+	 * handlers. And this check is not suitable for huge PUD fault.
+	 */
+	if (!in_pf &&
+	    !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
 		return false;
 
-	/* Enabled via shmem mount options or sysfs settings. */
-	if (shmem_file(vma->vm_file))
+	/*
+	 * Enabled via shmem mount options or sysfs settings.
+	 * Must be done before hugepage flags check since shmem has its
+	 * own flags.
+	 */
+	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
 
 	if (!khugepaged_enabled())
@@ -102,7 +128,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 		return false;
 
 	/* Only regular file is valid */
-	if (file_thp_enabled(vma))
+	if (!in_pf && file_thp_enabled(vma))
 		return true;
 
 	if (!vma_is_anonymous(vma))
@@ -114,9 +140,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	/*
 	 * THPeligible bit of smaps should show 1 for proper VMAs even
 	 * though anon_vma is not initialized yet.
+	 *
+	 * Allow page fault since anon_vma may be not initialized until
+	 * the first page fault.
 	 */
 	if (!vma->anon_vma)
-		return smaps;
+		return (smaps || in_pf);
 
 	return true;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6bbf3adac534..d683ef1edeb5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -466,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    khugepaged_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags, false))
+		if (hugepage_vma_check(vma, vm_flags, false, false))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -916,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
 	if (!transhuge_vma_suitable(vma, address))
 		return SCAN_ADDRESS_RANGE;
-	if (!hugepage_vma_check(vma, vma->vm_flags, false))
+	if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -1401,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
 	 * will not fail the vma for missing VM_HUGEPAGE
 	 */
-	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false))
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
 		return;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2091,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 			progress++;
 			break;
 		}
-		if (!hugepage_vma_check(vma, vma->vm_flags, false)) {
+		if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
 skip:
 			progress++;
 			continue;
diff --git a/mm/memory.c b/mm/memory.c
index dce0b2e686eb..2392d5db473a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4970,6 +4970,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		.gfp_mask = __get_fault_gfp_mask(vma),
 	};
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long vm_flags = vma->vm_flags;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	vm_fault_t ret;
@@ -4983,7 +4984,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 	if (!vmf.pud)
 		return VM_FAULT_OOM;
 retry_pud:
-	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
+	if (pud_none(*vmf.pud) &&
+	    hugepage_vma_check(vma, vm_flags, false, true)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5016,7 +5018,8 @@ retry_pud:
 	if (pud_trans_unstable(vmf.pud))
 		goto retry_pud;
 
-	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
+	if (pmd_none(*vmf.pmd) &&
+	    hugepage_vma_check(vma, vm_flags, false, true)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
-- 
cgit v1.2.3


From 1064026bab9f011bdea1251d44d66bbbcee04f6e Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 16 Jun 2022 10:48:39 -0700
Subject: mm: khugepaged: reorg some khugepaged helpers

The khugepaged_{enabled|always|req_madv} are not khugepaged only anymore,
move them to huge_mm.h and rename to hugepage_flags_xxx, and remove
khugepaged_req_madv due to no users.

Also move khugepaged_defrag to khugepaged.c since its only caller is in
that file, it doesn't have to be in a header file.

Link: https://lkml.kernel.org/r/20220616174840.1202070-7-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Zach O'Keefe <zokeefe@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    |  8 ++++++++
 include/linux/khugepaged.h | 14 --------------
 mm/huge_memory.c           |  4 ++--
 mm/khugepaged.c            | 18 +++++++++++-------
 4 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cd8a6c5d9fe5..ae3d8e2fd9e2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -116,6 +116,14 @@ extern struct kobj_attribute shmem_enabled_attr;
 
 extern unsigned long transparent_hugepage_flags;
 
+#define hugepage_flags_enabled()					       \
+	(transparent_hugepage_flags &				       \
+	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
+	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+#define hugepage_flags_always()				\
+	(transparent_hugepage_flags &			\
+	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
+
 /*
  * Do the below checks:
  *   - For file vma, check if the linear page offset of vma is
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index ea5fd4c398f7..384f034ae947 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -24,20 +24,6 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
 }
 #endif
 
-#define khugepaged_enabled()					       \
-	(transparent_hugepage_flags &				       \
-	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
-	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define khugepaged_always()				\
-	(transparent_hugepage_flags &			\
-	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
-#define khugepaged_req_madv()					\
-	(transparent_hugepage_flags &				\
-	 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
-#define khugepaged_defrag()					\
-	(transparent_hugepage_flags &				\
-	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
-
 static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4b90c7021e52..8e1b3d9f7ebf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -120,11 +120,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
 
-	if (!khugepaged_enabled())
+	if (!hugepage_flags_enabled())
 		return false;
 
 	/* THP settings require madvise. */
-	if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+	if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
 		return false;
 
 	/* Only regular file is valid */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d683ef1edeb5..01f71786d530 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -465,7 +465,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 			  unsigned long vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
-	    khugepaged_enabled()) {
+	    hugepage_flags_enabled()) {
 		if (hugepage_vma_check(vma, vm_flags, false, false))
 			__khugepaged_enter(vma->vm_mm);
 	}
@@ -761,6 +761,10 @@ static bool khugepaged_scan_abort(int nid)
 	return false;
 }
 
+#define khugepaged_defrag()					\
+	(transparent_hugepage_flags &				\
+	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
 {
@@ -858,7 +862,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
 			khugepaged_alloc_sleep();
 		} else
 			count_vm_event(THP_COLLAPSE_ALLOC);
-	} while (unlikely(!hpage) && likely(khugepaged_enabled()));
+	} while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
 
 	return hpage;
 }
@@ -2172,7 +2176,7 @@ breakouterloop_mmap_lock:
 static int khugepaged_has_work(void)
 {
 	return !list_empty(&khugepaged_scan.mm_head) &&
-		khugepaged_enabled();
+		hugepage_flags_enabled();
 }
 
 static int khugepaged_wait_event(void)
@@ -2237,7 +2241,7 @@ static void khugepaged_wait_work(void)
 		return;
 	}
 
-	if (khugepaged_enabled())
+	if (hugepage_flags_enabled())
 		wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
@@ -2268,7 +2272,7 @@ static void set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	if (!khugepaged_enabled()) {
+	if (!hugepage_flags_enabled()) {
 		calculate_min_free_kbytes();
 		goto update_wmarks;
 	}
@@ -2318,7 +2322,7 @@ int start_stop_khugepaged(void)
 	int err = 0;
 
 	mutex_lock(&khugepaged_mutex);
-	if (khugepaged_enabled()) {
+	if (hugepage_flags_enabled()) {
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -2344,7 +2348,7 @@ fail:
 void khugepaged_min_free_kbytes_update(void)
 {
 	mutex_lock(&khugepaged_mutex);
-	if (khugepaged_enabled() && khugepaged_thread)
+	if (hugepage_flags_enabled() && khugepaged_thread)
 		set_recommended_min_free_kbytes();
 	mutex_unlock(&khugepaged_mutex);
 }
-- 
cgit v1.2.3


From e95a9851787bbb3cd4deb40fe8bab03f731852d1 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 21 Jun 2022 16:56:17 -0700
Subject: hugetlb: skip to end of PT page mapping when pte not present

Patch series "hugetlb: speed up linear address scanning", v2.

At unmap, fork and remap time hugetlb address ranges are linearly scanned.
We can optimize these scans if the ranges are sparsely populated.

Also, enable page table "Lazy copy" for hugetlb at fork.

NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need to
add an arch specific version hugetlb_mask_last_page() to take advantage of
sparse address scanning improvements.  Baolin Wang added the routine for
arm64.  Other architectures which could be optimized are: ia64, mips,
parisc, powerpc, s390, sh and sparc.


This patch (of 4):

HugeTLB address ranges are linearly scanned during fork, unmap and remap
operations.  If a non-present entry is encountered, the code currently
continues to the next huge page aligned address.  However, a non-present
entry implies that the page table page for that entry is not present.
Therefore, the linear scan can skip to the end of range mapped by the page
table page.  This can speed operations on large sparsely populated hugetlb
mappings.

Create a new routine hugetlb_mask_last_page() that will return an address
mask.  When the mask is ORed with an address, the result will be the
address of the last huge page mapped by the associated page table page.
Use this mask to update addresses in routines which linearly scan hugetlb
address ranges when a non-present pte is encountered.

hugetlb_mask_last_page is related to the implementation of huge_pte_offset
as hugetlb_mask_last_page is called when huge_pte_offset returns NULL.
This patch only provides a complete hugetlb_mask_last_page implementation
when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined.  Architectures which
provide their own versions of huge_pte_offset can also provide their own
version of hugetlb_mask_last_page.

Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: James Houghton <jthoughton@google.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  1 +
 mm/hugetlb.c            | 56 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c6cccfaf8708..ce30fad5fd13 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -194,6 +194,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz);
 pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz);
+unsigned long hugetlb_mask_last_page(struct hstate *h);
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long *addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ffdf3fc4a83f..95fd1c36c17f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4727,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long npages = pages_per_huge_page(h);
 	struct address_space *mapping = src_vma->vm_file->f_mapping;
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	int ret = 0;
 
 	if (cow) {
@@ -4746,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		i_mmap_lock_read(mapping);
 	}
 
+	last_addr_mask = hugetlb_mask_last_page(h);
 	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr, sz);
-		if (!src_pte)
+		if (!src_pte) {
+			addr |= last_addr_mask;
 			continue;
+		}
 		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
@@ -4767,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		 * after taking the lock below.
 		 */
 		dst_entry = huge_ptep_get(dst_pte);
-		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+			addr |= last_addr_mask;
 			continue;
+		}
 
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
 		src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -4928,6 +4934,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	unsigned long sz = huge_page_size(h);
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long old_end = old_addr + len;
+	unsigned long last_addr_mask;
 	unsigned long old_addr_copy;
 	pte_t *src_pte, *dst_pte;
 	struct mmu_notifier_range range;
@@ -4943,12 +4950,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	/* Prevent race with file truncation */
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
 		src_pte = huge_pte_offset(mm, old_addr, sz);
-		if (!src_pte)
+		if (!src_pte) {
+			old_addr |= last_addr_mask;
+			new_addr |= last_addr_mask;
 			continue;
+		}
 		if (huge_pte_none(huge_ptep_get(src_pte)))
 			continue;
 
@@ -4993,6 +5004,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	bool force_flush = false;
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5013,11 +5025,14 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 				end);
 	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	address = start;
 	for (; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address, sz);
-		if (!ptep)
+		if (!ptep) {
+			address |= last_addr_mask;
 			continue;
+		}
 
 		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
@@ -6285,6 +6300,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	unsigned long pages = 0, psize = huge_page_size(h);
 	bool shared_pmd = false;
 	struct mmu_notifier_range range;
+	unsigned long last_addr_mask;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
@@ -6301,12 +6317,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += psize) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address, psize);
-		if (!ptep)
+		if (!ptep) {
+			address |= last_addr_mask;
 			continue;
+		}
 		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
 			/*
@@ -6856,6 +6875,33 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	return (pte_t *)pmd;
 }
 
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size.  Used to skip non-present
+ * page table entries when linearly scanning address ranges.  Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+	unsigned long hp_size = huge_page_size(h);
+
+	if (hp_size == PUD_SIZE)
+		return P4D_SIZE - PUD_SIZE;
+	else if (hp_size == PMD_SIZE)
+		return PUD_SIZE - PMD_SIZE;
+	else
+		return 0UL;
+}
+
+#else
+
+/* See description above.  Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+	return 0UL;
+}
+
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 
 /*
-- 
cgit v1.2.3


From 4ddb4d91b82f4b64458fe35bc8e395c7c082ea2b Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 21 Jun 2022 16:56:19 -0700
Subject: hugetlb: do not update address in huge_pmd_unshare

As an optimization for loops sequentially processing hugetlb address
ranges, huge_pmd_unshare would update a passed address if it unshared a
pmd.  Updating a loop control variable outside the loop like this is
generally a bad idea.  These loops are now using hugetlb_mask_last_page to
optimize scanning when non-present ptes are discovered.  The same can be
done when huge_pmd_unshare returns 1 indicating a pmd was unshared.

Remove address update from huge_pmd_unshare.  Change the passed argument
type and update all callers.  In loops sequentially processing addresses
use hugetlb_mask_last_page to update address if pmd is unshared.

[sfr@canb.auug.org.au: fix an unused variable warning/error]
  Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au
Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rolf Eike Beer <eike-kernel@sf-tec.de>
Cc: Will Deacon <will@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 44 +++++++++++++++++---------------------------
 mm/rmap.c               |  4 ++--
 3 files changed, 21 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ce30fad5fd13..75ee739d815b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -196,7 +196,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz);
 unsigned long hugetlb_mask_last_page(struct hstate *h);
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-				unsigned long *addr, pte_t *ptep);
+				unsigned long addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
@@ -243,7 +243,7 @@ static inline struct address_space *hugetlb_page_mapping_lock_write(
 
 static inline int huge_pmd_unshare(struct mm_struct *mm,
 					struct vm_area_struct *vma,
-					unsigned long *addr, pte_t *ptep)
+					unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 95fd1c36c17f..96635a2874e3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4935,7 +4935,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long old_end = old_addr + len;
 	unsigned long last_addr_mask;
-	unsigned long old_addr_copy;
 	pte_t *src_pte, *dst_pte;
 	struct mmu_notifier_range range;
 	bool shared_pmd = false;
@@ -4963,14 +4962,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 		if (huge_pte_none(huge_ptep_get(src_pte)))
 			continue;
 
-		/* old_addr arg to huge_pmd_unshare() is a pointer and so the
-		 * arg may be modified. Pass a copy instead to preserve the
-		 * value in old_addr.
-		 */
-		old_addr_copy = old_addr;
-
-		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+		if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
 			shared_pmd = true;
+			old_addr |= last_addr_mask;
+			new_addr |= last_addr_mask;
 			continue;
 		}
 
@@ -5035,10 +5030,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 		}
 
 		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+		if (huge_pmd_unshare(mm, vma, address, ptep)) {
 			spin_unlock(ptl);
 			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
 			force_flush = true;
+			address |= last_addr_mask;
 			continue;
 		}
 
@@ -6327,7 +6323,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			continue;
 		}
 		ptl = huge_pte_lock(h, mm, ptep);
-		if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+		if (huge_pmd_unshare(mm, vma, address, ptep)) {
 			/*
 			 * When uffd-wp is enabled on the vma, unshare
 			 * shouldn't happen at all.  Warn about it if it
@@ -6337,6 +6333,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			pages++;
 			spin_unlock(ptl);
 			shared_pmd = true;
+			address |= last_addr_mask;
 			continue;
 		}
 		pte = huge_ptep_get(ptep);
@@ -6759,11 +6756,11 @@ out:
  *	    0 the underlying pte page is not shared, or it is the last user
  */
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-					unsigned long *addr, pte_t *ptep)
+					unsigned long addr, pte_t *ptep)
 {
-	pgd_t *pgd = pgd_offset(mm, *addr);
-	p4d_t *p4d = p4d_offset(pgd, *addr);
-	pud_t *pud = pud_offset(p4d, *addr);
+	pgd_t *pgd = pgd_offset(mm, addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
+	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6773,14 +6770,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_clear(pud);
 	put_page(virt_to_page(ptep));
 	mm_dec_nr_pmds(mm);
-	/*
-	 * This update of passed address optimizes loops sequentially
-	 * processing addresses in increments of huge page size (PMD_SIZE
-	 * in this case).  By clearing the pud, a PUD_SIZE area is unmapped.
-	 * Update address to the 'last page' in the cleared area so that
-	 * calling loop can move to first page past this area.
-	 */
-	*addr |= PUD_SIZE - PMD_SIZE;
 	return 1;
 }
 
@@ -6792,7 +6781,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
-				unsigned long *addr, pte_t *ptep)
+				unsigned long addr, pte_t *ptep)
 {
 	return 0;
 }
@@ -6899,6 +6888,10 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
 /* See description above.  Architectures can provide their own version. */
 __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
 {
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+	if (huge_page_size(h) == PMD_SIZE)
+		return PUD_SIZE - PMD_SIZE;
+#endif
 	return 0UL;
 }
 
@@ -7125,14 +7118,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_invalidate_range_start(&range);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
-		unsigned long tmp = address;
-
 		ptep = huge_pte_offset(mm, address, sz);
 		if (!ptep)
 			continue;
 		ptl = huge_pte_lock(h, mm, ptep);
-		/* We don't want 'address' to be changed */
-		huge_pmd_unshare(mm, vma, &tmp, ptep);
+		huge_pmd_unshare(mm, vma, address, ptep);
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0532fd92ecb3..fb6b3b47f3e4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1559,7 +1559,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * do this outside rmap routines.
 			 */
 			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
 				flush_tlb_range(vma, range.start, range.end);
 				mmu_notifier_invalidate_range(mm, range.start,
 							      range.end);
@@ -1920,7 +1920,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * do this outside rmap routines.
 			 */
 			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
 				flush_tlb_range(vma, range.start, range.end);
 				mmu_notifier_invalidate_range(mm, range.start,
 							      range.end);
-- 
cgit v1.2.3


From bf75f200569dd05ac2112797f44548beb6b4be26 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Jun 2022 13:54:17 +0100
Subject: mm/page_alloc: add page->buddy_list and page->pcp_list

Patch series "Drain remote per-cpu directly", v5.

Some setups, notably NOHZ_FULL CPUs, may be running realtime or
latency-sensitive applications that cannot tolerate interference due to
per-cpu drain work queued by __drain_all_pages().  Introduce a new
mechanism to remotely drain the per-cpu lists.  It is made possible by
remotely locking 'struct per_cpu_pages' new per-cpu spinlocks.  This has
two advantages, the time to drain is more predictable and other unrelated
tasks are not interrupted.

This series has the same intent as Nicolas' series "mm/page_alloc: Remote
per-cpu lists drain support" -- avoid interference of a high priority task
due to a workqueue item draining per-cpu page lists.  While many workloads
can tolerate a brief interruption, it may cause a real-time task running
on a NOHZ_FULL CPU to miss a deadline and at minimum, the draining is
non-deterministic.

Currently an IRQ-safe local_lock protects the page allocator per-cpu
lists.  The local_lock on its own prevents migration and the IRQ disabling
protects from corruption due to an interrupt arriving while a page
allocation is in progress.

This series adjusts the locking.  A spinlock is added to struct
per_cpu_pages to protect the list contents while local_lock_irq is
ultimately replaced by just the spinlock in the final patch.  This allows
a remote CPU to safely.  Follow-on work should allow the spin_lock_irqsave
to be converted to spin_lock to avoid IRQs being disabled/enabled in most
cases.  The follow-on patch will be one kernel release later as it is
relatively high risk and it'll make bisections more clear if there are any
problems.

Patch 1 is a cosmetic patch to clarify when page->lru is storing buddy pages
	and when it is storing per-cpu pages.

Patch 2 shrinks per_cpu_pages to make room for a spin lock. Strictly speaking
	this is not necessary but it avoids per_cpu_pages consuming another
	cache line.

Patch 3 is a preparation patch to avoid code duplication.

Patch 4 is a minor correction.

Patch 5 uses a spin_lock to protect the per_cpu_pages contents while still
	relying on local_lock to prevent migration, stabilise the pcp
	lookup and prevent IRQ reentrancy.

Patch 6 remote drains per-cpu pages directly instead of using a workqueue.

Patch 7 uses a normal spinlock instead of local_lock for remote draining


This patch (of 7):

The page allocator uses page->lru for storing pages on either buddy or PCP
lists.  Create page->buddy_list and page->pcp_list as a union with
page->lru.  This is simply to clarify what type of list a page is on in
the page allocator.

No functional change intended.

[minchan@kernel.org: fix page lru fields in macros]
Link: https://lkml.kernel.org/r/20220624125423.6126-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h |  5 +++++
 mm/page_alloc.c          | 24 ++++++++++++------------
 2 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b961a29bf26..cf97f3884fda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -87,6 +87,7 @@ struct page {
 			 */
 			union {
 				struct list_head lru;
+
 				/* Or, for the Unevictable "LRU list" slot */
 				struct {
 					/* Always even, to negate PageTail */
@@ -94,6 +95,10 @@ struct page {
 					/* Count page's or folio's mlocks */
 					unsigned int mlock_count;
 				};
+
+				/* Or, free page */
+				struct list_head buddy_list;
+				struct list_head pcp_list;
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c9c02b23f02f..78ba5ba66586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -793,7 +793,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
 		return false;
 
 	__SetPageGuard(page);
-	INIT_LIST_HEAD(&page->lru);
+	INIT_LIST_HEAD(&page->buddy_list);
 	set_page_private(page, order);
 	/* Guard pages are not available for any usage */
 	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -936,7 +936,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add(&page->lru, &area->free_list[migratetype]);
+	list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -946,7 +946,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add_tail(&page->lru, &area->free_list[migratetype]);
+	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -960,7 +960,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_move_tail(&page->lru, &area->free_list[migratetype]);
+	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -970,7 +970,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
 	if (page_reported(page))
 		__ClearPageReported(page);
 
-	list_del(&page->lru);
+	list_del(&page->buddy_list);
 	__ClearPageBuddy(page);
 	set_page_private(page, 0);
 	zone->free_area[order].nr_free--;
@@ -1508,11 +1508,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		do {
 			int mt;
 
-			page = list_last_entry(list, struct page, lru);
+			page = list_last_entry(list, struct page, pcp_list);
 			mt = get_pcppage_migratetype(page);
 
 			/* must delete to avoid corrupting pcp list */
-			list_del(&page->lru);
+			list_del(&page->pcp_list);
 			count -= nr_pages;
 			pcp->count -= nr_pages;
 
@@ -3072,7 +3072,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
 		 */
-		list_add_tail(&page->lru, list);
+		list_add_tail(&page->pcp_list, list);
 		allocated++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3322,7 +3322,7 @@ void mark_free_pages(struct zone *zone)
 
 	for_each_migratetype_order(order, t) {
 		list_for_each_entry(page,
-				&zone->free_area[order].free_list[t], lru) {
+				&zone->free_area[order].free_list[t], buddy_list) {
 			unsigned long i;
 
 			pfn = page_to_pfn(page);
@@ -3411,7 +3411,7 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
-	list_add(&page->lru, &pcp->lists[pindex]);
+	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
 
 	/*
@@ -3674,8 +3674,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 				return NULL;
 		}
 
-		page = list_first_entry(list, struct page, lru);
-		list_del(&page->lru);
+		page = list_first_entry(list, struct page, pcp_list);
+		list_del(&page->pcp_list);
 		pcp->count -= 1 << order;
 	} while (check_new_pcp(page, order));
 
-- 
cgit v1.2.3


From 5d0a661d808fc8ddc26940b1a12b82ae356f3ae2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Jun 2022 13:54:18 +0100
Subject: mm/page_alloc: use only one PCP list for THP-sized allocations

The per_cpu_pages is cache-aligned on a standard x86-64 distribution
configuration but a later patch will add a new field which would push the
structure into the next cache line.  Use only one list to store THP-sized
pages on the per-cpu list.  This assumes that the vast majority of
THP-sized allocations are GFP_MOVABLE but even if it was another type, it
would not contribute to serious fragmentation that potentially causes a
later THP allocation failure.  Align per_cpu_pages on the cacheline
boundary to ensure there is no false cache sharing.

After this patch, the structure sizing is;

struct per_cpu_pages {
        int                        count;                /*     0     4 */
        int                        high;                 /*     4     4 */
        int                        batch;                /*     8     4 */
        short int                  free_factor;          /*    12     2 */
        short int                  expire;               /*    14     2 */
        struct list_head           lists[13];            /*    16   208 */

        /* size: 256, cachelines: 4, members: 6 */
        /* padding: 32 */
} __attribute__((__aligned__(64)));

Link: https://lkml.kernel.org/r/20220624125423.6126-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 11 +++++++----
 mm/page_alloc.c        |  4 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5da1135e6755..041136b5628a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -355,15 +355,18 @@ enum zone_watermarks {
 };
 
 /*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
- * for pageblock size for THP if configured.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
+ * for THP which will usually be GFP_MOVABLE. Even if it is another type,
+ * it should not contribute to serious fragmentation causing THP allocation
+ * failures.
  */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define NR_PCP_THP 1
 #else
 #define NR_PCP_THP 0
 #endif
-#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
+#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
+#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
 
 /*
  * Shift to encode migratetype and order in the same integer, with order
@@ -389,7 +392,7 @@ struct per_cpu_pages {
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
-};
+} ____cacheline_aligned_in_smp;
 
 struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 78ba5ba66586..b5c340d2cb43 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -653,7 +653,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
 		VM_BUG_ON(order != pageblock_order);
-		base = PAGE_ALLOC_COSTLY_ORDER + 1;
+		return NR_LOWORDER_PCP_LISTS;
 	}
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -667,7 +667,7 @@ static inline int pindex_to_order(unsigned int pindex)
 	int order = pindex / MIGRATE_PCPTYPES;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order > PAGE_ALLOC_COSTLY_ORDER)
+	if (pindex == NR_LOWORDER_PCP_LISTS)
 		order = pageblock_order;
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
-- 
cgit v1.2.3


From 4b23a68f953628eb4e4b7fe1294ebf93d4b8ceee Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Fri, 24 Jun 2022 13:54:21 +0100
Subject: mm/page_alloc: protect PCP lists with a spinlock

Currently the PCP lists are protected by using local_lock_irqsave to
prevent migration and IRQ reentrancy but this is inconvenient.  Remote
draining of the lists is impossible and a workqueue is required and every
task allocation/free must disable then enable interrupts which is
expensive.

As preparation for dealing with both of those problems, protect the
lists with a spinlock.  The IRQ-unsafe version of the lock is used
because IRQs are already disabled by local_lock_irqsave.  spin_trylock
is used in combination with local_lock_irqsave() but later will be
replaced with a spin_trylock_irqsave when the local_lock is removed.

The per_cpu_pages still fits within the same number of cache lines after
this patch relative to before the series.

struct per_cpu_pages {
        spinlock_t                 lock;                 /*     0     4 */
        int                        count;                /*     4     4 */
        int                        high;                 /*     8     4 */
        int                        batch;                /*    12     4 */
        short int                  free_factor;          /*    16     2 */
        short int                  expire;               /*    18     2 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head           lists[13];            /*    24   208 */

        /* size: 256, cachelines: 4, members: 7 */
        /* sum members: 228, holes: 1, sum holes: 4 */
        /* padding: 24 */
} __attribute__((__aligned__(64)));

There is overhead in the fast path due to acquiring the spinlock even
though the spinlock is per-cpu and uncontended in the common case.  Page
Fault Test (PFT) running on a 1-socket reported the following results on a
1 socket machine.

                                     5.19.0-rc3               5.19.0-rc3
                                        vanilla      mm-pcpspinirq-v5r16
Hmean     faults/sec-1   869275.7381 (   0.00%)   874597.5167 *   0.61%*
Hmean     faults/sec-3  2370266.6681 (   0.00%)  2379802.0362 *   0.40%*
Hmean     faults/sec-5  2701099.7019 (   0.00%)  2664889.7003 *  -1.34%*
Hmean     faults/sec-7  3517170.9157 (   0.00%)  3491122.8242 *  -0.74%*
Hmean     faults/sec-8  3965729.6187 (   0.00%)  3939727.0243 *  -0.66%*

There is a small hit in the number of faults per second but given that the
results are more stable, it's borderline noise.

[akpm@linux-foundation.org: add missing local_unlock_irqrestore() on contention path]
Link: https://lkml.kernel.org/r/20220624125423.6126-6-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Tested-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h |   1 +
 mm/page_alloc.c        | 119 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 99 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 041136b5628a..578247a341b2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -382,6 +382,7 @@ enum zone_watermarks {
 
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
+	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 026c9437456c..a08ec4ac7ef2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -133,6 +133,20 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = {
 	.lock = INIT_LOCAL_LOCK(lock),
 };
 
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -3101,15 +3115,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/*
+		 * free_pcppages_bulk expects IRQs disabled for zone->lock
+		 * so even though pcp->lock is not intended to be IRQ-safe,
+		 * it's needed in this context.
+		 */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 #endif
 
@@ -3122,16 +3143,17 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+	if (pcp->count) {
+		unsigned long flags;
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		/* See drain_zone_pages on why this is disabling IRQs */
+		spin_lock_irqsave(&pcp->lock, flags);
+		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 
 /*
@@ -3399,17 +3421,15 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+				   struct page *page, int migratetype,
 				   unsigned int order)
 {
-	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
 
 	__count_vm_event(PGFREE);
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
@@ -3436,6 +3456,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 void free_unref_page(struct page *page, unsigned int order)
 {
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
+	struct per_cpu_pages *pcp;
+	struct zone *zone;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
@@ -3459,7 +3482,16 @@ void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
+	zone = page_zone(page);
+	pcp_trylock_prepare(UP_flags);
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	if (spin_trylock(&pcp->lock)) {
+		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		spin_unlock(&pcp->lock);
+	} else {
+		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+	}
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3469,6 +3501,8 @@ void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp = NULL;
+	struct zone *locked_zone = NULL;
 	unsigned long flags;
 	int batch_count = 0;
 	int migratetype;
@@ -3495,6 +3529,17 @@ void free_unref_page_list(struct list_head *list)
 
 	local_lock_irqsave(&pagesets.lock, flags);
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone = page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone != locked_zone) {
+			if (pcp)
+				spin_unlock(&pcp->lock);
+			locked_zone = zone;
+			pcp = this_cpu_ptr(zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3504,18 +3549,24 @@ void free_unref_page_list(struct list_head *list)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+		free_unref_page_commit(zone, pcp, page, migratetype, 0);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
+			spin_unlock(&pcp->lock);
 			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
 			local_lock_irqsave(&pagesets.lock, flags);
+			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
 		}
 	}
+
+	if (pcp)
+		spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3729,18 +3780,32 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	struct page *page;
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
 
 	local_lock_irqsave(&pagesets.lock, flags);
 
+	/*
+	 * spin_trylock may fail due to a parallel drain. In the future, the
+	 * trylock will also protect against IRQ reentrancy.
+	 */
+	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	pcp_trylock_prepare(UP_flags);
+	if (!spin_trylock(&pcp->lock)) {
+		pcp_trylock_finish(UP_flags);
+		local_unlock_irqrestore(&pagesets.lock, flags);
+		return NULL;
+	}
+
 	/*
 	 * On allocation, reduce the number of pages that are batch freed.
 	 * See nr_pcp_free() where free_factor is increased for subsequent
 	 * frees.
 	 */
-	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+	spin_unlock(&pcp->lock);
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3775,7 +3840,8 @@ struct page *rmqueue(struct zone *preferred_zone,
 				migratetype != MIGRATE_MOVABLE) {
 			page = rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
 
@@ -5260,6 +5326,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 {
 	struct page *page;
 	unsigned long flags;
+	unsigned long __maybe_unused UP_flags;
 	struct zone *zone;
 	struct zoneref *z;
 	struct per_cpu_pages *pcp;
@@ -5340,11 +5407,15 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	if (unlikely(!zone))
 		goto failed;
 
-	/* Attempt the batch allocation */
+	/* Is a parallel drain in progress? */
 	local_lock_irqsave(&pagesets.lock, flags);
+	pcp_trylock_prepare(UP_flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
-	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+	if (!spin_trylock(&pcp->lock))
+		goto failed_irq;
 
+	/* Attempt the batch allocation */
+	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
 	while (nr_populated < nr_pages) {
 
 		/* Skip existing pages */
@@ -5357,8 +5428,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 								pcp, pcp_list);
 		if (unlikely(!page)) {
 			/* Try and allocate at least one page */
-			if (!nr_account)
+			if (!nr_account) {
+				spin_unlock(&pcp->lock);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5371,6 +5444,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
+	spin_unlock(&pcp->lock);
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -5380,6 +5455,7 @@ out:
 	return nr_populated;
 
 failed_irq:
+	pcp_trylock_finish(UP_flags);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 failed:
@@ -7020,6 +7096,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	spin_lock_init(&pcp->lock);
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
 
-- 
cgit v1.2.3


From 840532711d7299d7e937952482ec899d4622c452 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:35:35 +0530
Subject: mm/mmap: build protect protection_map[] with __P000

Patch series "mm/mmap: Drop __SXXX/__PXXX macros from across platforms",
v7.

__SXXX/__PXXX macros are unnecessary abstraction layer in creating the
generic protection_map[] array which is used for vm_get_page_prot().  This
abstraction layer can be avoided, if the platforms just define the array
protection_map[] for all possible vm_flags access permission combinations
and also export vm_get_page_prot() implementation.

This series drops __SXXX/__PXXX macros from across platforms in the tree.
First it build protects generic protection_map[] array with '#ifdef
__P000' and moves it inside platforms which enable
ARCH_HAS_VM_GET_PAGE_PROT.  Later this build protects same array with
'#ifdef ARCH_HAS_VM_GET_PAGE_PROT' and moves inside remaining platforms
while enabling ARCH_HAS_VM_GET_PAGE_PROT.  This adds a new macro
DECLARE_VM_GET_PAGE_PROT defining the current generic vm_get_page_prot(),
in order for it to be reused on platforms that do not require custom
implementation.  Finally, ARCH_HAS_VM_GET_PAGE_PROT can just be dropped,
as all platforms now define and export vm_get_page_prot(), via looking up
a private and static protection_map[] array.  protection_map[] data type
has been changed as 'static const' on all platforms that do not change it
during boot.


This patch (of 26):

Build protect generic protection_map[] array with __P000, so that it can
be moved inside all the platforms one after the other.  Otherwise there
will be build failures during this process.
CONFIG_ARCH_HAS_VM_GET_PAGE_PROT cannot be used for this purpose as only
certain platforms enable this config now.

Link: https://lkml.kernel.org/r/20220711070600.2378316-1-anshuman.khandual@arm.com
Link: https://lkml.kernel.org/r/20220711070600.2378316-2-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 ++
 mm/mmap.c          | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d4ebfc206e2b..1a435ce146a2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,7 +425,9 @@ extern unsigned int kobjsize(const void *objp);
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
+#ifdef __P000
 extern pgprot_t protection_map[16];
+#endif
 
 /*
  * The default fault flags that should be used by most of the
diff --git a/mm/mmap.c b/mm/mmap.c
index c14d7286a379..def0e03cf25c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -101,6 +101,7 @@ static void unmap_region(struct mm_struct *mm,
  *								w: (no) no
  *								x: (yes) yes
  */
+#ifdef __P000
 pgprot_t protection_map[16] __ro_after_init = {
 	[VM_NONE]					= __P000,
 	[VM_READ]					= __P001,
@@ -119,6 +120,7 @@ pgprot_t protection_map[16] __ro_after_init = {
 	[VM_SHARED | VM_EXEC | VM_WRITE]		= __S110,
 	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __S111
 };
+#endif
 
 #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 pgprot_t vm_get_page_prot(unsigned long vm_flags)
-- 
cgit v1.2.3


From 43957b5d11037a651d162f65c682ec3c76777fc8 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:35:36 +0530
Subject: mm/mmap: define DECLARE_VM_GET_PAGE_PROT

This just converts the generic vm_get_page_prot() implementation into a
new macro i.e DECLARE_VM_GET_PAGE_PROT which later can be used across
platforms when enabling them with ARCH_HAS_VM_GET_PAGE_PROT.  This does
not create any functional change.

Link: https://lkml.kernel.org/r/20220711070600.2378316-3-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 28 ++++++++++++++++++++++++++++
 mm/mmap.c               | 26 +-------------------------
 2 files changed, 29 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 3cdc16cfd867..014ee8f0fbaa 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1689,4 +1689,32 @@ typedef unsigned int pgtbl_mod_mask;
 #define MAX_PTRS_PER_P4D PTRS_PER_P4D
 #endif
 
+/* description of effects of mapping type and prot in current implementation.
+ * this is due to the limited x86 page protection hardware.  The expected
+ * behavior is in parens:
+ *
+ * map_type	prot
+ *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
+ * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
+ *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
+ *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
+ *
+ * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
+ * MAP_PRIVATE (with Enhanced PAN supported):
+ *								r: (no) no
+ *								w: (no) no
+ *								x: (yes) yes
+ */
+#define DECLARE_VM_GET_PAGE_PROT					\
+pgprot_t vm_get_page_prot(unsigned long vm_flags)			\
+{									\
+		return protection_map[vm_flags &			\
+			(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];	\
+}									\
+EXPORT_SYMBOL(vm_get_page_prot);
+
 #endif /* _LINUX_PGTABLE_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index def0e03cf25c..3c0d65743bc4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,26 +81,6 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-/* description of effects of mapping type and prot in current implementation.
- * this is due to the limited x86 page protection hardware.  The expected
- * behavior is in parens:
- *
- * map_type	prot
- *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
- * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
- *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
- *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
- *
- * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
- *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
- *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE (with Enhanced PAN supported):
- *								r: (no) no
- *								w: (no) no
- *								x: (yes) yes
- */
 #ifdef __P000
 pgprot_t protection_map[16] __ro_after_init = {
 	[VM_NONE]					= __P000,
@@ -123,11 +103,7 @@ pgprot_t protection_map[16] __ro_after_init = {
 #endif
 
 #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
-{
-	return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
-}
-EXPORT_SYMBOL(vm_get_page_prot);
+DECLARE_VM_GET_PAGE_PROT
 #endif	/* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
 
 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
-- 
cgit v1.2.3


From 09095f74130dfb2110ef2bcdd9ad0d42addaa1d5 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:35:41 +0530
Subject: mm/mmap: build protect protection_map[] with
 ARCH_HAS_VM_GET_PAGE_PROT

Now that protection_map[] has been moved inside those platforms that
enable ARCH_HAS_VM_GET_PAGE_PROT.  Hence generic protection_map[] array
now can be protected with CONFIG_ARCH_HAS_VM_GET_PAGE_PROT intead of
__P000.

Link: https://lkml.kernel.org/r/20220711070600.2378316-8-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/mmap.c          | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a435ce146a2..4b4dc93f9bc3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,7 +425,7 @@ extern unsigned int kobjsize(const void *objp);
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
-#ifdef __P000
+#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 extern pgprot_t protection_map[16];
 #endif
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 3c0d65743bc4..2a58a9cd0752 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,7 +81,7 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-#ifdef __P000
+#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 pgprot_t protection_map[16] __ro_after_init = {
 	[VM_NONE]					= __P000,
 	[VM_READ]					= __P001,
@@ -100,9 +100,6 @@ pgprot_t protection_map[16] __ro_after_init = {
 	[VM_SHARED | VM_EXEC | VM_WRITE]		= __S110,
 	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __S111
 };
-#endif
-
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
 DECLARE_VM_GET_PAGE_PROT
 #endif	/* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
 
-- 
cgit v1.2.3


From 3d923c5f1e21ad491acd4c0d62bf2481ce94016c Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 11 Jul 2022 12:36:00 +0530
Subject: mm/mmap: drop ARCH_HAS_VM_GET_PAGE_PROT

Now all the platforms enable ARCH_HAS_GET_PAGE_PROT.  They define and
export own vm_get_page_prot() whether custom or standard
DECLARE_VM_GET_PAGE_PROT.  Hence there is no need for default generic
fallback for vm_get_page_prot().  Just drop this fallback and also
ARCH_HAS_GET_PAGE_PROT mechanism.

Link: https://lkml.kernel.org/r/20220711070600.2378316-27-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brian Cain <bcain@quicinc.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/Kconfig      |  1 -
 arch/arc/Kconfig        |  1 -
 arch/arm/Kconfig        |  1 -
 arch/arm64/Kconfig      |  1 -
 arch/csky/Kconfig       |  1 -
 arch/hexagon/Kconfig    |  1 -
 arch/ia64/Kconfig       |  1 -
 arch/loongarch/Kconfig  |  1 -
 arch/m68k/Kconfig       |  1 -
 arch/microblaze/Kconfig |  1 -
 arch/mips/Kconfig       |  1 -
 arch/nios2/Kconfig      |  1 -
 arch/openrisc/Kconfig   |  1 -
 arch/parisc/Kconfig     |  1 -
 arch/powerpc/Kconfig    |  1 -
 arch/riscv/Kconfig      |  1 -
 arch/s390/Kconfig       |  1 -
 arch/sh/Kconfig         |  1 -
 arch/sparc/Kconfig      |  1 -
 arch/um/Kconfig         |  1 -
 arch/x86/Kconfig        |  1 -
 arch/xtensa/Kconfig     |  1 -
 include/linux/mm.h      |  3 ---
 mm/Kconfig              |  3 ---
 mm/mmap.c               | 22 ----------------------
 25 files changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index db1c8b329461..7d0d26b5b3f5 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -2,7 +2,6 @@
 config ALPHA
 	bool
 	default y
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_32BIT_USTAT_F_TINODE
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 8be56a5d8a9b..9e3653253ef2 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -13,7 +13,6 @@ config ARC
 	select ARCH_HAS_SETUP_DMA_OPS
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
 	select ARCH_32BIT_OFF_T
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e153b6d4fc5b..7630ba9cb6cc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -24,7 +24,6 @@ config ARM
 	select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU
 	select ARCH_HAS_TEARDOWN_DMA_OPS if MMU
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..7030bf3f8d6f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -45,7 +45,6 @@ config ARM64
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_ZONE_DMA_SET if EXPERT
 	select ARCH_HAVE_ELF_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 588b8a9c68ed..21d72b078eef 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -6,7 +6,6 @@ config CSKY
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace)
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index bc4ceecd0588..54eadf265178 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -6,7 +6,6 @@ config HEXAGON
 	def_bool y
 	select ARCH_32BIT_OFF_T
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_NO_PREEMPT
 	select DMA_GLOBAL_POOL
 	# Other pending projects/to-do items.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 0510a5737711..cb93769a9f2a 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -12,7 +12,6 @@ config IA64
 	select ARCH_HAS_DMA_MARK_CLEAN
 	select ARCH_HAS_STRNCPY_FROM_USER
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select ACPI
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index adf8cf6ec5d5..db2838cf8c02 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -9,7 +9,6 @@ config LOONGARCH
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PTE_SPECIAL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_INLINE_READ_LOCK if !PREEMPTION
 	select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 49aa0cf13e96..936cce42ae9a 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -7,7 +7,6 @@ config M68K
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
 	select ARCH_MIGHT_HAVE_PC_PARPORT if ISA
 	select ARCH_NO_PREEMPT if !COLDFIRE
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 15f91ba8a0c4..8cf429ad1c84 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -7,7 +7,6 @@ config MICROBLAZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d0b7eb11ec81..db09d45d59ec 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -14,7 +14,6 @@ config MIPS
 	select ARCH_HAS_STRNLEN_USER
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_SUPPORTS_UPROBES
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index e0459dffd218..4167f1eb4cd8 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -6,7 +6,6 @@ config NIOS2
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
 	select ARCH_HAS_DMA_SET_UNCACHED
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_NO_SWAP
 	select COMMON_CLK
 	select TIMER_OF
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index fe0dfb50eb86..e814df4c483c 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -10,7 +10,6 @@ config OPENRISC
 	select ARCH_HAS_DMA_SET_UNCACHED
 	select ARCH_HAS_DMA_CLEAR_UNCACHED
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select COMMON_CLK
 	select OF
 	select OF_EARLY_FLATTREE
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 891d82393957..fa400055b2d5 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -12,7 +12,6 @@ config PARISC
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_NO_SG_CHAIN
 	select ARCH_SUPPORTS_HUGETLBFS if PA20
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1035d172c7dd..250b8658b2d4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -140,7 +140,6 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 583389d4e43a..32ffef9f6e5b 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -32,7 +32,6 @@ config RISCV
 	select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
 	select ARCH_STACKWALK
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c4481377ca83..91c0b80a8bf0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -81,7 +81,6 @@ config S390
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_VDSO_DATA
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_INLINE_READ_LOCK
 	select ARCH_INLINE_READ_LOCK_BH
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 91f3ea325388..5f220e903e5a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -12,7 +12,6 @@ config SUPERH
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HIBERNATION_POSSIBLE if MMU
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_WANT_IPC_PARSE_VERSION
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 09f868613a4d..9c1cce74953a 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -13,7 +13,6 @@ config 64BIT
 config SPARC
 	bool
 	default y
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select DMA_OPS
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 7fb43654e5b5..4ec22e156a2e 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -10,7 +10,6 @@ config UML
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_STRNCPY_FROM_USER
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_NO_PREEMPT
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index be0b95e51df6..841e4843d0c4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,7 +94,6 @@ config X86
 	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_ZONE_DMA_SET if EXPERT
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 4c0d83520ff1..0b0f0172cced 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -11,7 +11,6 @@ config XTENSA
 	select ARCH_HAS_DMA_SET_UNCACHED if MMU
 	select ARCH_HAS_STRNCPY_FROM_USER if !KASAN
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_HAS_VM_GET_PAGE_PROT
 	select ARCH_USE_MEMTEST
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4b4dc93f9bc3..61e3101c44ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -425,9 +425,6 @@ extern unsigned int kobjsize(const void *objp);
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-extern pgprot_t protection_map[16];
-#endif
 
 /*
  * The default fault flags that should be used by most of the
diff --git a/mm/Kconfig b/mm/Kconfig
index c1fa4993a56f..56ca0e7c6f9a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -951,9 +951,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
 	  register alias named "current_stack_pointer", this config can be
 	  selected.
 
-config ARCH_HAS_VM_GET_PAGE_PROT
-	bool
-
 config ARCH_HAS_PTE_DEVMAP
 	bool
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 2a58a9cd0752..edf27a2789a2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,28 +81,6 @@ static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-pgprot_t protection_map[16] __ro_after_init = {
-	[VM_NONE]					= __P000,
-	[VM_READ]					= __P001,
-	[VM_WRITE]					= __P010,
-	[VM_WRITE | VM_READ]				= __P011,
-	[VM_EXEC]					= __P100,
-	[VM_EXEC | VM_READ]				= __P101,
-	[VM_EXEC | VM_WRITE]				= __P110,
-	[VM_EXEC | VM_WRITE | VM_READ]			= __P111,
-	[VM_SHARED]					= __S000,
-	[VM_SHARED | VM_READ]				= __S001,
-	[VM_SHARED | VM_WRITE]				= __S010,
-	[VM_SHARED | VM_WRITE | VM_READ]		= __S011,
-	[VM_SHARED | VM_EXEC]				= __S100,
-	[VM_SHARED | VM_EXEC | VM_READ]			= __S101,
-	[VM_SHARED | VM_EXEC | VM_WRITE]		= __S110,
-	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __S111
-};
-DECLARE_VM_GET_PAGE_PROT
-#endif	/* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
-
 static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
 {
 	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
-- 
cgit v1.2.3


From 3ce4fee4401206cf5a2c476ec0ee6c90191dfade Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 4 Jul 2022 21:21:55 +0800
Subject: mm/huge_memory: check pmd_present first in is_huge_zero_pmd

When pmd is non-present, pmd_pfn returns an insane value. So we should
check pmd_present first to avoid acquiring such insane value and also
avoid touching possible cold huge_zero_pfn cache line when pmd isn't
present.

Link: https://lkml.kernel.org/r/20220704132201.14611-11-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ae3d8e2fd9e2..12b297f9951d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -273,7 +273,7 @@ static inline bool is_huge_zero_page(struct page *page)
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
-	return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd);
+	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
 }
 
 static inline bool is_huge_zero_pud(pud_t pud)
-- 
cgit v1.2.3


From 121c1781aeb00475d163246d9ae7d8746e377040 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Mon, 4 Jul 2022 21:21:58 +0800
Subject: mm/huge_memory: fix comment of page_deferred_list

The current comment is confusing because if global or memcg deferred list
in the second tail page is occupied by compound_head, why we still use
page[2].deferred_list here? I think it wants to say that Global or memcg
deferred list in the first tail page is occupied by compound_mapcount and
compound_pincount so we use the second tail page's deferred_list instead.

Link: https://lkml.kernel.org/r/20220704132201.14611-14-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 12b297f9951d..37f2f11a6d7e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -294,8 +294,8 @@ static inline bool thp_migration_supported(void)
 static inline struct list_head *page_deferred_list(struct page *page)
 {
 	/*
-	 * Global or memcg deferred list in the second tail pages is
-	 * occupied by compound_head.
+	 * See organization of tail pages of compound page in
+	 * "struct page" definition.
 	 */
 	return &page[2].deferred_list;
 }
-- 
cgit v1.2.3


From dcadcf1c30619ead2f3280bfb7f74de8304be2bb Mon Sep 17 00:00:00 2001
From: Gang Li <ligang.bdlg@bytedance.com>
Date: Wed, 6 Jul 2022 11:46:54 +0800
Subject: mm, hugetlb: skip irrelevant nodes in show_free_areas()

show_free_areas() allows to filter out node specific data which is
irrelevant to the allocation request.  But hugetlb_show_meminfo() still
shows hugetlb on all nodes, which is redundant and unnecessary.

Use show_mem_node_skip() to skip irrelevant nodes.  And replace
hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid).

before-and-after sample output of OOM:

before:
```
[  214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB
[  214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
[  214.388334] lowmem_reserve[]: 0 0 0 0 0
[  214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20
[  214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
```

after:
```
[  145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u
[  145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig
[  145.152315] lowmem_reserve[]: 0 0 0 0 0
[  145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME)
[  145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
```

Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 18 ++++++++----------
 mm/page_alloc.c         |  8 ++++++--
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 75ee739d815b..4cdfce976644 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -152,7 +152,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct page *ref_page, zap_flags_t zap_flags);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(char *buf, int len, int nid);
-void hugetlb_show_meminfo(void);
+void hugetlb_show_meminfo_node(int nid);
 unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
@@ -298,7 +298,7 @@ static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid)
 	return 0;
 }
 
-static inline void hugetlb_show_meminfo(void)
+static inline void hugetlb_show_meminfo_node(int nid)
 {
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 96635a2874e3..bb763f5d30b9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4477,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
 			     nid, h->surplus_huge_pages_node[nid]);
 }
 
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
 {
 	struct hstate *h;
-	int nid;
 
 	if (!hugepages_supported())
 		return;
 
-	for_each_node_state(nid, N_MEMORY)
-		for_each_hstate(h)
-			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
-				nid,
-				h->nr_huge_pages_node[nid],
-				h->free_huge_pages_node[nid],
-				h->surplus_huge_pages_node[nid],
-				huge_page_size(h) / SZ_1K);
+	for_each_hstate(h)
+		printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+			nid,
+			h->nr_huge_pages_node[nid],
+			h->free_huge_pages_node[nid],
+			h->surplus_huge_pages_node[nid],
+			huge_page_size(h) / SZ_1K);
 }
 
 void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 215b26664ad7..4fa96d3510fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6022,7 +6022,7 @@ static void show_migration_types(unsigned char type)
 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 {
 	unsigned long free_pcp = 0;
-	int cpu;
+	int cpu, nid;
 	struct zone *zone;
 	pg_data_t *pgdat;
 
@@ -6210,7 +6210,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		printk(KERN_CONT "= %lukB\n", K(total));
 	}
 
-	hugetlb_show_meminfo();
+	for_each_online_node(nid) {
+		if (show_mem_node_skip(filter, nid, nodemask))
+			continue;
+		hugetlb_show_meminfo_node(nid);
+	}
 
 	printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
 
-- 
cgit v1.2.3


From 0d8bc0b10aeab543bdccb86180f58db1f79f7cee Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Wed, 13 Jul 2022 20:53:14 +0800
Subject: writeback: cleanup bdi_sched_wait()

bdi_sched_wait() is no longer used since commit 839a8e8660b6 ("writeback:
replace custom worker pool implementation with unbound workqueue"), so
remove it.

Link: https://lkml.kernel.org/r/20220713125314.171345-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d452071db572..e84b745a6811 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -140,12 +140,6 @@ static inline bool mapping_can_writeback(struct address_space *mapping)
 	return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
 }
 
-static inline int bdi_sched_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
-- 
cgit v1.2.3


From 62df90b53e6f332bb69b73621998826c49a17323 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Sun, 19 Jun 2022 15:46:41 +0800
Subject: net, lib/once: remove {net_}get_random_once_wait macro

DO_ONCE(func, ...) will call func with spinlock which acquired by
spin_lock_irqsave in __do_once_start.  But the get_random_once_wait will
sleep in get_random_bytes_wait -> wait_for_random_bytes.

Fortunately, there is no place to use {net_}get_random_once_wait, so we
could remove them simply.

Link: https://lkml.kernel.org/r/20220619074641.40916-1-wuchi.zero@gmail.com
Signed-off-by: wuchi <wuchi.zero@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/net.h  | 2 --
 include/linux/once.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 12093f4db50c..8613772a1f58 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -303,8 +303,6 @@ do {									\
 
 #define net_get_random_once(buf, nbytes)			\
 	get_random_once((buf), (nbytes))
-#define net_get_random_once_wait(buf, nbytes)			\
-	get_random_once_wait((buf), (nbytes))
 
 /*
  * E.g. XFS meta- & log-data is in slab pages, or bcache meta
diff --git a/include/linux/once.h b/include/linux/once.h
index f54523052bbc..b14d8b309d52 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -54,7 +54,5 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 
 #define get_random_once(buf, nbytes)					     \
 	DO_ONCE(get_random_bytes, (buf), (nbytes))
-#define get_random_once_wait(buf, nbytes)                                    \
-	DO_ONCE(get_random_bytes_wait, (buf), (nbytes))                      \
 
 #endif /* _LINUX_ONCE_H */
-- 
cgit v1.2.3


From 43c249ea0b1e10baac4a1264a25d69723ce5d2c2 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Fri, 24 Jun 2022 16:14:12 +0200
Subject: compiler-gcc.h: remove ancient workaround for gcc PR 58670

The workaround for 'asm goto' miscompilation introduces a compiler barrier
quirk that inhibits many useful compiler optimizations.  For example,
__try_cmpxchg_user compiles to:

   11375:	41 8b 4d 00          	mov    0x0(%r13),%ecx
   11379:	41 8b 02             	mov    (%r10),%eax
   1137c:	f0 0f b1 0a          	lock cmpxchg %ecx,(%rdx)
   11380:	0f 94 c2             	sete   %dl
   11383:	84 d2                	test   %dl,%dl
   11385:	75 c4                	jne    1134b <...>
   11387:	41 89 02             	mov    %eax,(%r10)

where the barrier inhibits flags propagation from asm when compiled with
gcc-12.

When the mentioned quirk is removed, the following code is generated:

   11553:	41 8b 4d 00          	mov    0x0(%r13),%ecx
   11557:	41 8b 02             	mov    (%r10),%eax
   1155a:	f0 0f b1 0a          	lock cmpxchg %ecx,(%rdx)
   1155e:	74 c9                	je     11529 <...>
   11560:	41 89 02             	mov    %eax,(%r10)

The refered compiler bug:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670

was fixed for gcc-4.8.2.

Current minimum required version of GCC is version 5.1 which has the above
'asm goto' miscompilation fixed, so remove the workaround.

Link: https://lkml.kernel.org/r/20220624141412.72274-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a0c55eeaeaf1..9b157b71036f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -66,17 +66,6 @@
 		__builtin_unreachable();	\
 	} while (0)
 
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
 #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
 #define __HAVE_BUILTIN_BSWAP32__
 #define __HAVE_BUILTIN_BSWAP64__
-- 
cgit v1.2.3


From 045ed31e23aea840648c290dbde04797064960db Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 24 Jun 2022 08:30:04 +0300
Subject: kfifo: fix kfifo_to_user() return type

The kfifo_to_user() macro is supposed to return zero for success or
negative error codes.  Unfortunately, there is a signedness bug so it
returns unsigned int.  This only affects callers which try to save the
result in ssize_t and as far as I can see the only place which does that
is line6_hwdep_read().

TL;DR: s/_uint/_int/.

Link: https://lkml.kernel.org/r/YrVL3OJVLlNhIMFs@kili
Fixes: 144ecf310eb5 ("kfifo: fix kfifo_alloc() to return a signed int value")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kfifo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 86249476b57f..0b35a41440ff 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -688,7 +688,7 @@ __kfifo_uint_must_check_helper( \
  * writer, you don't need extra locking to use these macro.
  */
 #define	kfifo_to_user(fifo, to, len, copied) \
-__kfifo_uint_must_check_helper( \
+__kfifo_int_must_check_helper( \
 ({ \
 	typeof((fifo) + 1) __tmp = (fifo); \
 	void __user *__to = (to); \
-- 
cgit v1.2.3


From 4f09903078eeb9138cddce8db06100b82f8620e8 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:27 +0200
Subject: cpumask: add UP optimised for_each_*_cpu versions

On uniprocessor builds, the following loops will always run over a mask
that contains one enabled CPU (cpu0):

    - for_each_possible_cpu
    - for_each_online_cpu
    - for_each_present_cpu

Provide uniprocessor-specific macros for these loops, that always run
exactly once.

Link: https://lkml.kernel.org/r/3a92869b902a075b97be5d1452c9c6badbbff0df.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..533612770bc0 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -811,9 +811,16 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 /* First bits of cpu_bit_bitmap are in fact unset. */
 #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
 
+#if NR_CPUS == 1
+/* Uniprocessor: the possible/online/present masks are always "1" */
+#define for_each_possible_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_online_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_present_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#else
 #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
 #define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+#endif
 
 /* Wrappers for arch boot code to manipulate normally-constant masks */
 void init_cpu_present(const struct cpumask *src);
-- 
cgit v1.2.3


From b81dce77cedcea6f00292f02d4b1ebbfc2c5988d Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:25 +0200
Subject: cpumask: Fix invalid uniprocessor mask assumption

On uniprocessor builds, any CPU mask is assumed to contain exactly one CPU
(cpu0).  This assumption ignores the existence of empty masks, resulting
in incorrect behaviour.

cpumask_first_zero(), cpumask_next_zero(), and for_each_cpu_not() don't
provide behaviour matching the assumption that a UP mask is always "1",
and instead provide behaviour matching the empty mask.

Drop the incorrectly optimised code and use the generic implementations in
all cases.

Link: https://lkml.kernel.org/r/86bf3f005abba2d92120ddd0809235cab4f759a6.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Suggested-by: Yury Norov <yury.norov@gmail.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 99 ++++++++++---------------------------------------
 lib/Makefile            |  3 +-
 lib/cpumask.c           |  2 +
 3 files changed, 22 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 533612770bc0..6c5b4ee000f2 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -116,85 +116,6 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
 	return cpu;
 }
 
-#if NR_CPUS == 1
-/* Uniprocessor.  Assume all masks are "1". */
-static inline unsigned int cpumask_first(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
-					     const struct cpumask *srcp2)
-{
-	return 0;
-}
-
-static inline unsigned int cpumask_last(const struct cpumask *srcp)
-{
-	return 0;
-}
-
-/* Valid inputs for n are -1 and 0. */
-static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_and(int n,
-					    const struct cpumask *srcp,
-					    const struct cpumask *andp)
-{
-	return n+1;
-}
-
-static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
-					     int start, bool wrap)
-{
-	/* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */
-	return (wrap && n == 0);
-}
-
-/* cpu must be a valid cpu, ie 0, so there's no other choice. */
-static inline unsigned int cpumask_any_but(const struct cpumask *mask,
-					   unsigned int cpu)
-{
-	return 1;
-}
-
-static inline unsigned int cpumask_local_spread(unsigned int i, int node)
-{
-	return 0;
-}
-
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
-					     const struct cpumask *src2p) {
-	return cpumask_first_and(src1p, src2p);
-}
-
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
-{
-	return cpumask_first(srcp);
-}
-
-#define for_each_cpu(cpu, mask)			\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_not(cpu, mask)		\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_wrap(cpu, mask, start)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
-#define for_each_cpu_and(cpu, mask1, mask2)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
-#else
 /**
  * cpumask_first - get the first cpu in a cpumask
  * @srcp: the cpumask pointer
@@ -260,10 +181,29 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 
 int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+
+#if NR_CPUS == 1
+/* Uniprocessor: there is only one valid CPU */
+static inline unsigned int cpumask_local_spread(unsigned int i, int node)
+{
+	return 0;
+}
+
+static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+					     const struct cpumask *src2p) {
+	return cpumask_first_and(src1p, src2p);
+}
+
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+	return cpumask_first(srcp);
+}
+#else
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
 int cpumask_any_distribute(const struct cpumask *srcp);
+#endif /* NR_CPUS */
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
@@ -324,7 +264,6 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 	for ((cpu) = -1;						\
 		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
-#endif /* SMP */
 
 #define CPU_BITS_NONE						\
 {								\
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..bcc7e8ea0cde 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,10 +34,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
-	 buildid.o
+	 buildid.o cpumask.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
-lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a971a82d2f43..b9728513a4d4 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,6 +192,7 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif
 
+#if NR_CPUS > 1
 /**
  * cpumask_local_spread - select the i'th cpu with local numa cpu's first
  * @i: index number
@@ -279,3 +280,4 @@ int cpumask_any_distribute(const struct cpumask *srcp)
 	return next;
 }
 EXPORT_SYMBOL(cpumask_any_distribute);
+#endif /* NR_CPUS */
-- 
cgit v1.2.3


From 953257a9252a9b3c58ca68fc5bf26fc65e5b1cb8 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Sat, 2 Jul 2022 18:08:28 +0200
Subject: cpumask: update cpumask_next_wrap() signature

The extern specifier is not needed for this declaration, so drop it.  The
function also depends only on the input parameters, and has no side
effects, so it can be marked __pure like other functions in cpumask.h.

Link: https://lkml.kernel.org/r/72ab755695b74bb5fbaa756ae4c0edd708d172f1.1656777646.git.sander@svanheule.net
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 6c5b4ee000f2..523857884ae4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -229,7 +229,7 @@ int cpumask_any_distribute(const struct cpumask *srcp);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
-extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
-- 
cgit v1.2.3


From 91561d4ecb755f056f8ff04f9dcaec210140e55c Mon Sep 17 00:00:00 2001
From: Chao Gao <chao.gao@intel.com>
Date: Fri, 15 Jul 2022 18:45:33 +0800
Subject: swiotlb: remove unused fields in io_tlb_mem

Commit 20347fca71a3 ("swiotlb: split up the global swiotlb lock") splits
io_tlb_mem into multiple areas. Each area has its own lock and index. The
global ones are not used so remove them.

Signed-off-by: Chao Gao <chao.gao@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 5 -----
 kernel/dma/swiotlb.c    | 2 --
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index f65ff1930120..d3ae03edbbd2 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -79,11 +79,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
  * @used:	The number of used IO TLB block.
  * @list:	The free list describing the number of free entries available
  *		from each index.
- * @index:	The index to start searching in the next round.
  * @orig_addr:	The original address corresponding to a mapped entry.
  * @alloc_size:	Size of the allocated buffer.
- * @lock:	The lock to protect the above data structures in the map and
- *		unmap calls.
  * @debugfs:	The dentry to debugfs.
  * @late_alloc:	%true if allocated using the page allocator
  * @force_bounce: %true if swiotlb bouncing is forced
@@ -97,8 +94,6 @@ struct io_tlb_mem {
 	void *vaddr;
 	unsigned long nslabs;
 	unsigned long used;
-	unsigned int index;
-	spinlock_t lock;
 	struct dentry *debugfs;
 	bool late_alloc;
 	bool force_bounce;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c50e6fe20f37..cbffa0b1ace5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -253,14 +253,12 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
 	mem->nslabs = nslabs;
 	mem->start = start;
 	mem->end = mem->start + bytes;
-	mem->index = 0;
 	mem->late_alloc = late_alloc;
 	mem->nareas = nareas;
 	mem->area_nslabs = nslabs / mem->nareas;
 
 	mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
 
-	spin_lock_init(&mem->lock);
 	for (i = 0; i < mem->nareas; i++) {
 		spin_lock_init(&mem->areas[i].lock);
 		mem->areas[i].index = 0;
-- 
cgit v1.2.3


From 942a8186eb4451700dadd1d60a2e43ce67605991 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 12 Jul 2022 08:43:07 +0200
Subject: swiotlb: move struct io_tlb_slot to swiotlb.c

No need to expose this structure definition in the header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 6 +-----
 kernel/dma/swiotlb.c    | 6 ++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d3ae03edbbd2..35bc4e281c21 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -101,11 +101,7 @@ struct io_tlb_mem {
 	unsigned int nareas;
 	unsigned int area_nslabs;
 	struct io_tlb_area *areas;
-	struct io_tlb_slot {
-		phys_addr_t orig_addr;
-		size_t alloc_size;
-		unsigned int list;
-	} *slots;
+	struct io_tlb_slot *slots;
 };
 extern struct io_tlb_mem io_tlb_default_mem;
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 608923e8dab1..39dee4004439 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -62,6 +62,12 @@
 
 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
+struct io_tlb_slot {
+	phys_addr_t orig_addr;
+	size_t alloc_size;
+	unsigned int list;
+};
+
 static bool swiotlb_force_bounce;
 static bool swiotlb_force_disable;
 
-- 
cgit v1.2.3


From a229cc14f3395311b899e5e582b71efa8dd01df0 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 14 Jul 2022 19:15:24 +0800
Subject: dma-mapping: add dma_opt_mapping_size()

Streaming DMA mapping involving an IOMMU may be much slower for larger
total mapping size. This is because every IOMMU DMA mapping requires an
IOVA to be allocated and freed. IOVA sizes above a certain limit are not
cached, which can have a big impact on DMA mapping performance.

Provide an API for device drivers to know this "optimal" limit, such that
they may try to produce mapping which don't exceed it.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/core-api/dma-api.rst | 14 ++++++++++++++
 include/linux/dma-map-ops.h        |  1 +
 include/linux/dma-mapping.h        |  5 +++++
 kernel/dma/mapping.c               | 12 ++++++++++++
 4 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 6d6d0edd2d27..829f20a193ca 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -204,6 +204,20 @@ Returns the maximum size of a mapping for the device. The size parameter
 of the mapping functions like dma_map_single(), dma_map_page() and
 others should not be larger than the returned value.
 
+::
+
+	size_t
+	dma_opt_mapping_size(struct device *dev);
+
+Returns the maximum optimal size of a mapping for the device.
+
+Mapping larger buffers may take much longer in certain scenarios. In
+addition, for high-rate short-lived streaming mappings, the upfront time
+spent on the mapping may account for an appreciable part of the total
+request lifetime. As such, if splitting larger requests incurs no
+significant performance penalty, then device drivers are advised to
+limit total DMA streaming mappings length to the returned value.
+
 ::
 
 	bool
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d5b06b3a4a6..98ceba6fa848 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -69,6 +69,7 @@ struct dma_map_ops {
 	int (*dma_supported)(struct device *dev, u64 mask);
 	u64 (*get_required_mask)(struct device *dev);
 	size_t (*max_mapping_size)(struct device *dev);
+	size_t (*opt_mapping_size)(void);
 	unsigned long (*get_merge_boundary)(struct device *dev);
 };
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index dca2b1355bb1..fe3849434b2a 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -144,6 +144,7 @@ int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
+size_t dma_opt_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 unsigned long dma_get_merge_boundary(struct device *dev);
 struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
@@ -266,6 +267,10 @@ static inline size_t dma_max_mapping_size(struct device *dev)
 {
 	return 0;
 }
+static inline size_t dma_opt_mapping_size(struct device *dev)
+{
+	return 0;
+}
 static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	return false;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index db7244291b74..1bfe11b1edb6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -773,6 +773,18 @@ size_t dma_max_mapping_size(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dma_max_mapping_size);
 
+size_t dma_opt_mapping_size(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	size_t size = SIZE_MAX;
+
+	if (ops && ops->opt_mapping_size)
+		size = ops->opt_mapping_size();
+
+	return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From 6d9870b7e5def2450e21316515b9efc0529204dd Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 14 Jul 2022 19:15:25 +0800
Subject: dma-iommu: add iommu_dma_opt_mapping_size()

Add the IOMMU callback for DMA mapping API dma_opt_mapping_size(), which
allows the drivers to know the optimal mapping limit and thus limit the
requested IOVA lengths.

This value is based on the IOVA rcache range limit, as IOVAs allocated
above this limit must always be newly allocated, which may be quite slow.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c | 6 ++++++
 drivers/iommu/iova.c      | 5 +++++
 include/linux/iova.h      | 2 ++
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f90251572a5d..9e1586447ee8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1459,6 +1459,11 @@ static unsigned long iommu_dma_get_merge_boundary(struct device *dev)
 	return (1UL << __ffs(domain->pgsize_bitmap)) - 1;
 }
 
+static size_t iommu_dma_opt_mapping_size(void)
+{
+	return iova_rcache_range();
+}
+
 static const struct dma_map_ops iommu_dma_ops = {
 	.alloc			= iommu_dma_alloc,
 	.free			= iommu_dma_free,
@@ -1479,6 +1484,7 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.map_resource		= iommu_dma_map_resource,
 	.unmap_resource		= iommu_dma_unmap_resource,
 	.get_merge_boundary	= iommu_dma_get_merge_boundary,
+	.opt_mapping_size	= iommu_dma_opt_mapping_size,
 };
 
 /*
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index db77aa675145..9f00b58d546e 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -26,6 +26,11 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
 static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 
+unsigned long iova_rcache_range(void)
+{
+	return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1);
+}
+
 static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct iova_domain *iovad;
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 320a70e40233..c6ba6d95d79c 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -79,6 +79,8 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
 int iova_cache_get(void);
 void iova_cache_put(void);
 
+unsigned long iova_rcache_range(void);
+
 void free_iova(struct iova_domain *iovad, unsigned long pfn);
 void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
-- 
cgit v1.2.3


From 2b038e786f8338a3bc22d791000753e0ec113e00 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Jun 2022 20:28:42 +0300
Subject: gpiolib: devres: Get rid of unused devm_gpio_free()

The last user, which in fact was a dead code, has gone a year ago,
previous one 3 years ago. On top of that we want to drop away the
legacy GPIO APIs in the kernel, so take a chance to get rid of
unused devm_gpio_free() and accompanying stuff.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/gpio/gpiolib-devres.c                    | 32 ------------------------
 include/linux/gpio.h                             |  6 -----
 3 files changed, 39 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 2d39967bafcc..55272942e721 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -277,7 +277,6 @@ GPIO
   devm_gpiochip_add_data()
   devm_gpio_request()
   devm_gpio_request_one()
-  devm_gpio_free()
 
 I2C
   devm_i2c_new_dummy_device()
diff --git a/drivers/gpio/gpiolib-devres.c b/drivers/gpio/gpiolib-devres.c
index 79da85d17b71..16a696249229 100644
--- a/drivers/gpio/gpiolib-devres.c
+++ b/drivers/gpio/gpiolib-devres.c
@@ -375,9 +375,6 @@ void devm_gpiod_put_array(struct device *dev, struct gpio_descs *descs)
 }
 EXPORT_SYMBOL_GPL(devm_gpiod_put_array);
 
-
-
-
 static void devm_gpio_release(struct device *dev, void *res)
 {
 	unsigned *gpio = res;
@@ -385,13 +382,6 @@ static void devm_gpio_release(struct device *dev, void *res)
 	gpio_free(*gpio);
 }
 
-static int devm_gpio_match(struct device *dev, void *res, void *data)
-{
-	unsigned *this = res, *gpio = data;
-
-	return *this == *gpio;
-}
-
 /**
  *      devm_gpio_request - request a GPIO for a managed device
  *      @dev: device to request the GPIO for
@@ -402,11 +392,7 @@ static int devm_gpio_match(struct device *dev, void *res, void *data)
  *      same arguments and performs the same function as
  *      gpio_request().  GPIOs requested with this function will be
  *      automatically freed on driver detach.
- *
- *      If an GPIO allocated with this function needs to be freed
- *      separately, devm_gpio_free() must be used.
  */
-
 int devm_gpio_request(struct device *dev, unsigned gpio, const char *label)
 {
 	unsigned *dr;
@@ -459,24 +445,6 @@ int devm_gpio_request_one(struct device *dev, unsigned gpio,
 }
 EXPORT_SYMBOL_GPL(devm_gpio_request_one);
 
-/**
- *      devm_gpio_free - free a GPIO
- *      @dev: device to free GPIO for
- *      @gpio: GPIO to free
- *
- *      Except for the extra @dev argument, this function takes the
- *      same arguments and performs the same function as gpio_free().
- *      This function instead of gpio_free() should be used to manually
- *      free GPIOs allocated with devm_gpio_request().
- */
-void devm_gpio_free(struct device *dev, unsigned int gpio)
-{
-
-	WARN_ON(devres_release(dev, devm_gpio_release, devm_gpio_match,
-		&gpio));
-}
-EXPORT_SYMBOL_GPL(devm_gpio_free);
-
 static void devm_gpio_chip_release(void *data)
 {
 	struct gpio_chip *gc = data;
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 008ad3ee56b7..a370387fa406 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -95,7 +95,6 @@ struct device;
 int devm_gpio_request(struct device *dev, unsigned gpio, const char *label);
 int devm_gpio_request_one(struct device *dev, unsigned gpio,
 			  unsigned long flags, const char *label);
-void devm_gpio_free(struct device *dev, unsigned int gpio);
 
 #else /* ! CONFIG_GPIOLIB */
 
@@ -240,11 +239,6 @@ static inline int devm_gpio_request_one(struct device *dev, unsigned gpio,
 	return -EINVAL;
 }
 
-static inline void devm_gpio_free(struct device *dev, unsigned int gpio)
-{
-	WARN_ON(1);
-}
-
 #endif /* ! CONFIG_GPIOLIB */
 
 #endif /* __LINUX_GPIO_H */
-- 
cgit v1.2.3


From 2a1192ff0835cb4b0ccd6f6e85c93aa0dc6f66b7 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Jun 2022 17:23:38 +0200
Subject: gpio: twl4030: Drop platform teardown callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no machine providing a teardown callback, so drop the unused
code.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpio-twl4030.c | 11 -----------
 include/linux/mfd/twl.h     |  2 --
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-twl4030.c b/drivers/gpio/gpio-twl4030.c
index de249726230e..e2cb7cb90c8c 100644
--- a/drivers/gpio/gpio-twl4030.c
+++ b/drivers/gpio/gpio-twl4030.c
@@ -593,18 +593,7 @@ out:
 /* Cannot use as gpio_twl4030_probe() calls us */
 static int gpio_twl4030_remove(struct platform_device *pdev)
 {
-	struct twl4030_gpio_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	struct gpio_twl4030_priv *priv = platform_get_drvdata(pdev);
-	int status;
-
-	if (pdata && pdata->teardown) {
-		status = pdata->teardown(&pdev->dev, priv->gpio_chip.base,
-					 TWL4030_GPIO_MAX);
-		if (status) {
-			dev_dbg(&pdev->dev, "teardown --> %d\n", status);
-			return status;
-		}
-	}
 
 	gpiochip_remove(&priv->gpio_chip);
 
diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h
index 8871cc5188a0..c8cd31756037 100644
--- a/include/linux/mfd/twl.h
+++ b/include/linux/mfd/twl.h
@@ -594,8 +594,6 @@ struct twl4030_gpio_platform_data {
 
 	int		(*setup)(struct device *dev,
 				unsigned gpio, unsigned ngpio);
-	int		(*teardown)(struct device *dev,
-				unsigned gpio, unsigned ngpio);
 };
 
 struct twl4030_madc_platform_data {
-- 
cgit v1.2.3


From 7e55b33d3f18fde5c7a57b6c52d80499485c737f Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Jun 2022 21:48:02 +0200
Subject: gpio: ucb1400: Remove platform setup and teardown support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no user of these callbacks. The motivation for this change is
to stop returning an error code from the remove callback.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpio-ucb1400.c | 20 --------------------
 drivers/mfd/ucb1400_core.c  |  6 ++----
 include/linux/ucb1400.h     |  2 --
 3 files changed, 2 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-ucb1400.c b/drivers/gpio/gpio-ucb1400.c
index d2a8644864c3..386e69300332 100644
--- a/drivers/gpio/gpio-ucb1400.c
+++ b/drivers/gpio/gpio-ucb1400.c
@@ -64,34 +64,14 @@ static int ucb1400_gpio_probe(struct platform_device *dev)
 	ucb->gc.can_sleep = true;
 
 	err = devm_gpiochip_add_data(&dev->dev, &ucb->gc, ucb);
-	if (err)
-		goto err;
-
-	if (ucb->gpio_setup)
-		err = ucb->gpio_setup(&dev->dev, ucb->gc.ngpio);
 
 err:
 	return err;
 
 }
 
-static int ucb1400_gpio_remove(struct platform_device *dev)
-{
-	int err = 0;
-	struct ucb1400_gpio *ucb = platform_get_drvdata(dev);
-
-	if (ucb && ucb->gpio_teardown) {
-		err = ucb->gpio_teardown(&dev->dev, ucb->gc.ngpio);
-		if (err)
-			return err;
-	}
-
-	return err;
-}
-
 static struct platform_driver ucb1400_gpio_driver = {
 	.probe	= ucb1400_gpio_probe,
-	.remove	= ucb1400_gpio_remove,
 	.driver	= {
 		.name	= "ucb1400_gpio"
 	},
diff --git a/drivers/mfd/ucb1400_core.c b/drivers/mfd/ucb1400_core.c
index 8c3832a58ef6..ac1d18039568 100644
--- a/drivers/mfd/ucb1400_core.c
+++ b/drivers/mfd/ucb1400_core.c
@@ -72,11 +72,9 @@ static int ucb1400_core_probe(struct device *dev)
 
 	/* GPIO */
 	ucb_gpio.ac97 = ac97;
-	if (pdata) {
-		ucb_gpio.gpio_setup = pdata->gpio_setup;
-		ucb_gpio.gpio_teardown = pdata->gpio_teardown;
+	if (pdata)
 		ucb_gpio.gpio_offset = pdata->gpio_offset;
-	}
+
 	ucb->ucb1400_gpio = platform_device_alloc("ucb1400_gpio", -1);
 	if (!ucb->ucb1400_gpio) {
 		err = -ENOMEM;
diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h
index 0968ef458447..22345391350b 100644
--- a/include/linux/ucb1400.h
+++ b/include/linux/ucb1400.h
@@ -84,8 +84,6 @@ struct ucb1400_gpio {
 	struct gpio_chip	gc;
 	struct snd_ac97		*ac97;
 	int			gpio_offset;
-	int			(*gpio_setup)(struct device *dev, int ngpio);
-	int			(*gpio_teardown)(struct device *dev, int ngpio);
 };
 
 struct ucb1400_ts {
-- 
cgit v1.2.3


From c269df8c5ad316eac47a9078e7a2339e1956637a Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Wed, 13 Jul 2022 15:14:18 +0200
Subject: gpiolib: add support for bias pull disable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change prepares the gpio core to look at firmware flags and set
'FLAG_BIAS_DISABLE' if necessary. It works in similar way to
'GPIO_PULL_DOWN' and 'GPIO_PULL_UP'.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpiolib.c       | 8 ++++++--
 include/linux/gpio/machine.h | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9535f48e18d1..0692ec84d3b0 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -3945,9 +3945,11 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
 	if (lflags & GPIO_OPEN_SOURCE)
 		set_bit(FLAG_OPEN_SOURCE, &desc->flags);
 
-	if ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) {
+	if (((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) ||
+	    ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DISABLE)) ||
+	    ((lflags & GPIO_PULL_DOWN) && (lflags & GPIO_PULL_DISABLE))) {
 		gpiod_err(desc,
-			  "both pull-up and pull-down enabled, invalid configuration\n");
+			  "multiple pull-up, pull-down or pull-disable enabled, invalid configuration\n");
 		return -EINVAL;
 	}
 
@@ -3955,6 +3957,8 @@ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
 		set_bit(FLAG_PULL_UP, &desc->flags);
 	else if (lflags & GPIO_PULL_DOWN)
 		set_bit(FLAG_PULL_DOWN, &desc->flags);
+	else if (lflags & GPIO_PULL_DISABLE)
+		set_bit(FLAG_BIAS_DISABLE, &desc->flags);
 
 	ret = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY));
 	if (ret < 0)
diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h
index 4d55da28e664..0b619eb7ae83 100644
--- a/include/linux/gpio/machine.h
+++ b/include/linux/gpio/machine.h
@@ -14,6 +14,7 @@ enum gpio_lookup_flags {
 	GPIO_TRANSITORY			= (1 << 3),
 	GPIO_PULL_UP			= (1 << 4),
 	GPIO_PULL_DOWN			= (1 << 5),
+	GPIO_PULL_DISABLE		= (1 << 6),
 
 	GPIO_LOOKUP_FLAGS_DEFAULT	= GPIO_ACTIVE_HIGH | GPIO_PERSISTENT,
 };
-- 
cgit v1.2.3


From 31bea23119cda87088c6bd4085a1e442c6c5974c Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Wed, 13 Jul 2022 15:14:19 +0200
Subject: gpiolib: of: support bias pull disable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On top of looking at PULL_UP and PULL_DOWN flags, also look at
PULL_DISABLE and set the appropriate GPIO flag. The GPIO core will then
pass down this to controllers that support it.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpiolib-of.c | 7 +++++++
 include/linux/of_gpio.h   | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
index f80307be37d5..a037b50bef33 100644
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -354,6 +354,9 @@ struct gpio_desc *gpiod_get_from_of_node(const struct device_node *node,
 	if (flags & OF_GPIO_PULL_DOWN)
 		lflags |= GPIO_PULL_DOWN;
 
+	if (flags & OF_GPIO_PULL_DISABLE)
+		lflags |= GPIO_PULL_DISABLE;
+
 	ret = gpiod_configure_flags(desc, propname, lflags, dflags);
 	if (ret < 0) {
 		gpiod_put(desc);
@@ -556,6 +559,8 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id,
 		*flags |= GPIO_PULL_UP;
 	if (of_flags & OF_GPIO_PULL_DOWN)
 		*flags |= GPIO_PULL_DOWN;
+	if (of_flags & OF_GPIO_PULL_DISABLE)
+		*flags |= GPIO_PULL_DISABLE;
 
 	return desc;
 }
@@ -621,6 +626,8 @@ static struct gpio_desc *of_parse_own_gpio(struct device_node *np,
 		*lflags |= GPIO_PULL_UP;
 	if (xlate_flags & OF_GPIO_PULL_DOWN)
 		*lflags |= GPIO_PULL_DOWN;
+	if (xlate_flags & OF_GPIO_PULL_DISABLE)
+		*lflags |= GPIO_PULL_DISABLE;
 
 	if (of_property_read_bool(np, "input"))
 		*dflags |= GPIOD_IN;
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 8bf2ea859653..a5166eb93437 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -29,6 +29,7 @@ enum of_gpio_flags {
 	OF_GPIO_TRANSITORY = 0x8,
 	OF_GPIO_PULL_UP = 0x10,
 	OF_GPIO_PULL_DOWN = 0x20,
+	OF_GPIO_PULL_DISABLE = 0x40,
 };
 
 #ifdef CONFIG_OF_GPIO
-- 
cgit v1.2.3


From 62fa5c9800a0b9aecf9ce0743f12a16057c9400d Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Fri, 3 Jun 2022 17:57:25 +0200
Subject: mfd: max77714: Update Luca Ceresoli's e-mail address

My Bootlin address is preferred from now on.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220603155727.1232061-4-luca@lucaceresoli.net
---
 drivers/mfd/max77714.c       | 4 ++--
 include/linux/mfd/max77714.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/max77714.c b/drivers/mfd/max77714.c
index d1e4247800d2..143a432ea343 100644
--- a/drivers/mfd/max77714.c
+++ b/drivers/mfd/max77714.c
@@ -3,7 +3,7 @@
  * Maxim MAX77714 Core Driver
  *
  * Copyright (C) 2022 Luca Ceresoli
- * Author: Luca Ceresoli <luca@lucaceresoli.net>
+ * Author: Luca Ceresoli <luca.ceresoli@bootlin.com>
  */
 
 #include <linux/i2c.h>
@@ -148,5 +148,5 @@ static struct i2c_driver max77714_driver = {
 module_i2c_driver(max77714_driver);
 
 MODULE_DESCRIPTION("Maxim MAX77714 MFD core driver");
-MODULE_AUTHOR("Luca Ceresoli <luca@lucaceresoli.net>");
+MODULE_AUTHOR("Luca Ceresoli <luca.ceresoli@bootlin.com>");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/max77714.h b/include/linux/mfd/max77714.h
index a970dc455426..7947e0d697a5 100644
--- a/include/linux/mfd/max77714.h
+++ b/include/linux/mfd/max77714.h
@@ -3,7 +3,7 @@
  * Maxim MAX77714 Register and data structures definition.
  *
  * Copyright (C) 2022 Luca Ceresoli
- * Author: Luca Ceresoli <luca@lucaceresoli.net>
+ * Author: Luca Ceresoli <luca.ceresoli@bootlin.com>
  */
 
 #ifndef __LINUX_MFD_MAX77714_H_
-- 
cgit v1.2.3


From 128ac294e1b437cb8a7f2ff8ede1cde9082bddbe Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 30 May 2022 21:24:28 +0200
Subject: mfd: t7l66xb: Drop platform disable callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

None of the in-tree instantiations of struct t7l66xb_platform_data
provides a disable callback. So better don't dereference this function
pointer unconditionally. As there is no user, drop it completely instead
of calling it conditional.

This is a preparation for making platform remove callbacks return void.

Fixes: 1f192015ca5b ("mfd: driver for the T7L66XB TMIO SoC")
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220530192430.2108217-3-u.kleine-koenig@pengutronix.de
---
 drivers/mfd/t7l66xb.c       | 6 +-----
 include/linux/mfd/t7l66xb.h | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c
index 5369c67e3280..663ffd4b8570 100644
--- a/drivers/mfd/t7l66xb.c
+++ b/drivers/mfd/t7l66xb.c
@@ -397,11 +397,8 @@ err_noirq:
 
 static int t7l66xb_remove(struct platform_device *dev)
 {
-	struct t7l66xb_platform_data *pdata = dev_get_platdata(&dev->dev);
 	struct t7l66xb *t7l66xb = platform_get_drvdata(dev);
-	int ret;
 
-	ret = pdata->disable(dev);
 	clk_disable_unprepare(t7l66xb->clk48m);
 	clk_put(t7l66xb->clk48m);
 	clk_disable_unprepare(t7l66xb->clk32k);
@@ -412,8 +409,7 @@ static int t7l66xb_remove(struct platform_device *dev)
 	mfd_remove_devices(&dev->dev);
 	kfree(t7l66xb);
 
-	return ret;
-
+	return 0;
 }
 
 static struct platform_driver t7l66xb_platform_driver = {
diff --git a/include/linux/mfd/t7l66xb.h b/include/linux/mfd/t7l66xb.h
index 69632c1b07bd..ae3e7a5c5219 100644
--- a/include/linux/mfd/t7l66xb.h
+++ b/include/linux/mfd/t7l66xb.h
@@ -12,7 +12,6 @@
 
 struct t7l66xb_platform_data {
 	int (*enable)(struct platform_device *dev);
-	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
 	int (*resume)(struct platform_device *dev);
 
-- 
cgit v1.2.3


From 6e1f1b1c93ceec1d9bfa9213775abc19161de5f1 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 30 May 2022 21:24:29 +0200
Subject: mfd: tc6387xb: Drop disable callback that is never called
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver never calls the disable callback, so drop the member from
the platform struct and all callbacks from the actual platform datas.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220530192430.2108217-4-u.kleine-koenig@pengutronix.de
---
 arch/arm/mach-pxa/eseries.c  | 1 -
 include/linux/mfd/tc6387xb.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c
index 08f8737aa8fd..99781eec065e 100644
--- a/arch/arm/mach-pxa/eseries.c
+++ b/arch/arm/mach-pxa/eseries.c
@@ -139,7 +139,6 @@ static void __init __maybe_unused eseries_register_clks(void)
 
 static struct tc6387xb_platform_data e330_tc6387xb_info = {
 	.enable   = &eseries_tmio_enable,
-	.disable  = &eseries_tmio_disable,
 	.suspend  = &eseries_tmio_suspend,
 	.resume   = &eseries_tmio_resume,
 };
diff --git a/include/linux/mfd/tc6387xb.h b/include/linux/mfd/tc6387xb.h
index b4888209494a..aacf1dcc86b9 100644
--- a/include/linux/mfd/tc6387xb.h
+++ b/include/linux/mfd/tc6387xb.h
@@ -12,7 +12,6 @@
 
 struct tc6387xb_platform_data {
 	int (*enable)(struct platform_device *dev);
-	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
 	int (*resume)(struct platform_device *dev);
 };
-- 
cgit v1.2.3


From de58cee8c6b803dda3304eace346919fe880a40a Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Tue, 31 May 2022 14:49:56 +0200
Subject: mfd: mt6397-core: Add MT6357 PMIC support

Adds support for PMIC keys, Regulator, and RTC for the MT6357 PMIC.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220531124959.202787-5-fparent@baylibre.com
---
 drivers/mfd/mt6397-core.c            |   44 +
 include/linux/mfd/mt6357/core.h      |  119 +++
 include/linux/mfd/mt6357/registers.h | 1574 ++++++++++++++++++++++++++++++++++
 3 files changed, 1737 insertions(+)
 create mode 100644 include/linux/mfd/mt6357/core.h
 create mode 100644 include/linux/mfd/mt6357/registers.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 1a368ad08f58..3cb8836bd08d 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -12,10 +12,12 @@
 #include <linux/regmap.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6357/core.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6359/core.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/registers.h>
 #include <linux/mfd/mt6359/registers.h>
 #include <linux/mfd/mt6397/registers.h>
@@ -23,6 +25,9 @@
 #define MT6323_RTC_BASE		0x8000
 #define MT6323_RTC_SIZE		0x40
 
+#define MT6357_RTC_BASE		0x0588
+#define MT6357_RTC_SIZE		0x3c
+
 #define MT6358_RTC_BASE		0x0588
 #define MT6358_RTC_SIZE		0x3c
 
@@ -37,6 +42,11 @@ static const struct resource mt6323_rtc_resources[] = {
 	DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC),
 };
 
+static const struct resource mt6357_rtc_resources[] = {
+	DEFINE_RES_MEM(MT6357_RTC_BASE, MT6357_RTC_SIZE),
+	DEFINE_RES_IRQ(MT6357_IRQ_RTC),
+};
+
 static const struct resource mt6358_rtc_resources[] = {
 	DEFINE_RES_MEM(MT6358_RTC_BASE, MT6358_RTC_SIZE),
 	DEFINE_RES_IRQ(MT6358_IRQ_RTC),
@@ -66,6 +76,13 @@ static const struct resource mt6323_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6323_IRQ_STATUS_FCHRKEY, "homekey"),
 };
 
+static const struct resource mt6357_keys_resources[] = {
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_PWRKEY, "powerkey"),
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY, "homekey"),
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_PWRKEY_R, "powerkey_r"),
+	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY_R, "homekey_r"),
+};
+
 static const struct resource mt6397_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_PWRKEY, "powerkey"),
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_HOMEKEY, "homekey"),
@@ -100,6 +117,22 @@ static const struct mfd_cell mt6323_devs[] = {
 	},
 };
 
+static const struct mfd_cell mt6357_devs[] = {
+	{
+		.name = "mt6357-regulator",
+	}, {
+		.name = "mt6357-rtc",
+		.num_resources = ARRAY_SIZE(mt6357_rtc_resources),
+		.resources = mt6357_rtc_resources,
+		.of_compatible = "mediatek,mt6357-rtc",
+	}, {
+		.name = "mtk-pmic-keys",
+		.num_resources = ARRAY_SIZE(mt6357_keys_resources),
+		.resources = mt6357_keys_resources,
+		.of_compatible = "mediatek,mt6357-keys"
+	},
+};
+
 static const struct mfd_cell mt6358_devs[] = {
 	{
 		.name = "mt6358-regulator",
@@ -179,6 +212,14 @@ static const struct chip_data mt6323_core = {
 	.irq_init = mt6397_irq_init,
 };
 
+static const struct chip_data mt6357_core = {
+	.cid_addr = MT6357_SWCID,
+	.cid_shift = 8,
+	.cells = mt6357_devs,
+	.cell_size = ARRAY_SIZE(mt6357_devs),
+	.irq_init = mt6358_irq_init,
+};
+
 static const struct chip_data mt6358_core = {
 	.cid_addr = MT6358_SWCID,
 	.cid_shift = 8,
@@ -261,6 +302,9 @@ static const struct of_device_id mt6397_of_match[] = {
 	{
 		.compatible = "mediatek,mt6323",
 		.data = &mt6323_core,
+	}, {
+		.compatible = "mediatek,mt6357",
+		.data = &mt6357_core,
 	}, {
 		.compatible = "mediatek,mt6358",
 		.data = &mt6358_core,
diff --git a/include/linux/mfd/mt6357/core.h b/include/linux/mfd/mt6357/core.h
new file mode 100644
index 000000000000..2441611264fd
--- /dev/null
+++ b/include/linux/mfd/mt6357/core.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 BayLibre, SAS
+ * Author: Fabien Parent <fparent@baylibre.com>
+ */
+
+#ifndef __MFD_MT6357_CORE_H__
+#define __MFD_MT6357_CORE_H__
+
+enum mt6357_irq_top_status_shift {
+	MT6357_BUCK_TOP = 0,
+	MT6357_LDO_TOP,
+	MT6357_PSC_TOP,
+	MT6357_SCK_TOP,
+	MT6357_BM_TOP,
+	MT6357_HK_TOP,
+	MT6357_XPP_TOP,
+	MT6357_AUD_TOP,
+	MT6357_MISC_TOP,
+};
+
+enum mt6357_irq_numbers {
+	MT6357_IRQ_VPROC_OC = 0,
+	MT6357_IRQ_VCORE_OC,
+	MT6357_IRQ_VMODEM_OC,
+	MT6357_IRQ_VS1_OC,
+	MT6357_IRQ_VPA_OC,
+	MT6357_IRQ_VCORE_PREOC,
+	MT6357_IRQ_VFE28_OC = 16,
+	MT6357_IRQ_VXO22_OC,
+	MT6357_IRQ_VRF18_OC,
+	MT6357_IRQ_VRF12_OC,
+	MT6357_IRQ_VEFUSE_OC,
+	MT6357_IRQ_VCN33_OC,
+	MT6357_IRQ_VCN28_OC,
+	MT6357_IRQ_VCN18_OC,
+	MT6357_IRQ_VCAMA_OC,
+	MT6357_IRQ_VCAMD_OC,
+	MT6357_IRQ_VCAMIO_OC,
+	MT6357_IRQ_VLDO28_OC,
+	MT6357_IRQ_VUSB33_OC,
+	MT6357_IRQ_VAUX18_OC,
+	MT6357_IRQ_VAUD28_OC,
+	MT6357_IRQ_VIO28_OC,
+	MT6357_IRQ_VIO18_OC,
+	MT6357_IRQ_VSRAM_PROC_OC,
+	MT6357_IRQ_VSRAM_OTHERS_OC,
+	MT6357_IRQ_VIBR_OC,
+	MT6357_IRQ_VDRAM_OC,
+	MT6357_IRQ_VMC_OC,
+	MT6357_IRQ_VMCH_OC,
+	MT6357_IRQ_VEMC_OC,
+	MT6357_IRQ_VSIM1_OC,
+	MT6357_IRQ_VSIM2_OC,
+	MT6357_IRQ_PWRKEY = 48,
+	MT6357_IRQ_HOMEKEY,
+	MT6357_IRQ_PWRKEY_R,
+	MT6357_IRQ_HOMEKEY_R,
+	MT6357_IRQ_NI_LBAT_INT,
+	MT6357_IRQ_CHRDET,
+	MT6357_IRQ_CHRDET_EDGE,
+	MT6357_IRQ_VCDT_HV_DET,
+	MT6357_IRQ_WATCHDOG,
+	MT6357_IRQ_VBATON_UNDET,
+	MT6357_IRQ_BVALID_DET,
+	MT6357_IRQ_OV,
+	MT6357_IRQ_RTC = 64,
+	MT6357_IRQ_FG_BAT0_H = 80,
+	MT6357_IRQ_FG_BAT0_L,
+	MT6357_IRQ_FG_CUR_H,
+	MT6357_IRQ_FG_CUR_L,
+	MT6357_IRQ_FG_ZCV,
+	MT6357_IRQ_BATON_LV = 96,
+	MT6357_IRQ_BATON_HT,
+	MT6357_IRQ_BAT_H = 112,
+	MT6357_IRQ_BAT_L,
+	MT6357_IRQ_AUXADC_IMP,
+	MT6357_IRQ_NAG_C_DLTV,
+	MT6357_IRQ_AUDIO = 128,
+	MT6357_IRQ_ACCDET = 133,
+	MT6357_IRQ_ACCDET_EINT0,
+	MT6357_IRQ_ACCDET_EINT1,
+	MT6357_IRQ_SPI_CMD_ALERT = 144,
+	MT6357_IRQ_NR,
+};
+
+#define MT6357_IRQ_BUCK_BASE	MT6357_IRQ_VPROC_OC
+#define MT6357_IRQ_LDO_BASE	MT6357_IRQ_VFE28_OC
+#define MT6357_IRQ_PSC_BASE	MT6357_IRQ_PWRKEY
+#define MT6357_IRQ_SCK_BASE	MT6357_IRQ_RTC
+#define MT6357_IRQ_BM_BASE	MT6357_IRQ_FG_BAT0_H
+#define MT6357_IRQ_HK_BASE	MT6357_IRQ_BAT_H
+#define MT6357_IRQ_AUD_BASE	MT6357_IRQ_AUDIO
+#define MT6357_IRQ_MISC_BASE	MT6357_IRQ_SPI_CMD_ALERT
+
+#define MT6357_IRQ_BUCK_BITS (MT6357_IRQ_VCORE_PREOC - MT6357_IRQ_BUCK_BASE + 1)
+#define MT6357_IRQ_LDO_BITS (MT6357_IRQ_VSIM2_OC - MT6357_IRQ_LDO_BASE + 1)
+#define MT6357_IRQ_PSC_BITS (MT6357_IRQ_VCDT_HV_DET - MT6357_IRQ_PSC_BASE + 1)
+#define MT6357_IRQ_SCK_BITS (MT6357_IRQ_RTC - MT6357_IRQ_SCK_BASE + 1)
+#define MT6357_IRQ_BM_BITS (MT6357_IRQ_BATON_HT - MT6357_IRQ_BM_BASE + 1)
+#define MT6357_IRQ_HK_BITS (MT6357_IRQ_NAG_C_DLTV - MT6357_IRQ_HK_BASE + 1)
+#define MT6357_IRQ_AUD_BITS (MT6357_IRQ_ACCDET_EINT1 - MT6357_IRQ_AUD_BASE + 1)
+#define MT6357_IRQ_MISC_BITS	\
+	(MT6357_IRQ_SPI_CMD_ALERT - MT6357_IRQ_MISC_BASE + 1)
+
+#define MT6357_TOP_GEN(sp)	\
+{	\
+	.hwirq_base = MT6357_IRQ_##sp##_BASE,	\
+	.num_int_regs =	\
+		((MT6357_IRQ_##sp##_BITS - 1) /	\
+		MTK_PMIC_REG_WIDTH) + 1,	\
+	.en_reg = MT6357_##sp##_TOP_INT_CON0,	\
+	.en_reg_shift = 0x6,	\
+	.sta_reg = MT6357_##sp##_TOP_INT_STATUS0,	\
+	.sta_reg_shift = 0x2,	\
+	.top_offset = MT6357_##sp##_TOP,	\
+}
+
+#endif /* __MFD_MT6357_CORE_H__ */
diff --git a/include/linux/mfd/mt6357/registers.h b/include/linux/mfd/mt6357/registers.h
new file mode 100644
index 000000000000..e24af83b618d
--- /dev/null
+++ b/include/linux/mfd/mt6357/registers.h
@@ -0,0 +1,1574 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 MediaTek Inc.
+ */
+
+#ifndef __MFD_MT6357_REGISTERS_H__
+#define __MFD_MT6357_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6357_TOP0_ID                       0x0
+#define MT6357_TOP0_REV0                     0x2
+#define MT6357_TOP0_DSN_DBI                  0x4
+#define MT6357_TOP0_DSN_DXI                  0x6
+#define MT6357_HWCID                         0x8
+#define MT6357_SWCID                         0xa
+#define MT6357_PONSTS                        0xc
+#define MT6357_POFFSTS                       0xe
+#define MT6357_PSTSCTL                       0x10
+#define MT6357_PG_DEB_STS0                   0x12
+#define MT6357_PG_SDN_STS0                   0x14
+#define MT6357_OC_SDN_STS0                   0x16
+#define MT6357_THERMALSTATUS                 0x18
+#define MT6357_TOP_CON                       0x1a
+#define MT6357_TEST_OUT                      0x1c
+#define MT6357_TEST_CON0                     0x1e
+#define MT6357_TEST_CON1                     0x20
+#define MT6357_TESTMODE_SW                   0x22
+#define MT6357_TOPSTATUS                     0x24
+#define MT6357_TDSEL_CON                     0x26
+#define MT6357_RDSEL_CON                     0x28
+#define MT6357_SMT_CON0                      0x2a
+#define MT6357_SMT_CON1                      0x2c
+#define MT6357_TOP_RSV0                      0x2e
+#define MT6357_TOP_RSV1                      0x30
+#define MT6357_DRV_CON0                      0x32
+#define MT6357_DRV_CON1                      0x34
+#define MT6357_DRV_CON2                      0x36
+#define MT6357_DRV_CON3                      0x38
+#define MT6357_FILTER_CON0                   0x3a
+#define MT6357_FILTER_CON1                   0x3c
+#define MT6357_FILTER_CON2                   0x3e
+#define MT6357_FILTER_CON3                   0x40
+#define MT6357_TOP_STATUS                    0x42
+#define MT6357_TOP_STATUS_SET                0x44
+#define MT6357_TOP_STATUS_CLR                0x46
+#define MT6357_TOP_TRAP                      0x48
+#define MT6357_TOP1_ID                       0x80
+#define MT6357_TOP1_REV0                     0x82
+#define MT6357_TOP1_DSN_DBI                  0x84
+#define MT6357_TOP1_DSN_DXI                  0x86
+#define MT6357_GPIO_DIR0                     0x88
+#define MT6357_GPIO_DIR0_SET                 0x8a
+#define MT6357_GPIO_DIR0_CLR                 0x8c
+#define MT6357_GPIO_PULLEN0                  0x8e
+#define MT6357_GPIO_PULLEN0_SET              0x90
+#define MT6357_GPIO_PULLEN0_CLR              0x92
+#define MT6357_GPIO_PULLSEL0                 0x94
+#define MT6357_GPIO_PULLSEL0_SET             0x96
+#define MT6357_GPIO_PULLSEL0_CLR             0x98
+#define MT6357_GPIO_DINV0                    0x9a
+#define MT6357_GPIO_DINV0_SET                0x9c
+#define MT6357_GPIO_DINV0_CLR                0x9e
+#define MT6357_GPIO_DOUT0                    0xa0
+#define MT6357_GPIO_DOUT0_SET                0xa2
+#define MT6357_GPIO_DOUT0_CLR                0xa4
+#define MT6357_GPIO_PI0                      0xa6
+#define MT6357_GPIO_POE0                     0xa8
+#define MT6357_GPIO_MODE0                    0xaa
+#define MT6357_GPIO_MODE0_SET                0xac
+#define MT6357_GPIO_MODE0_CLR                0xae
+#define MT6357_GPIO_MODE1                    0xb0
+#define MT6357_GPIO_MODE1_SET                0xb2
+#define MT6357_GPIO_MODE1_CLR                0xb4
+#define MT6357_GPIO_MODE2                    0xb6
+#define MT6357_GPIO_MODE2_SET                0xb8
+#define MT6357_GPIO_MODE2_CLR                0xba
+#define MT6357_GPIO_MODE3                    0xbc
+#define MT6357_GPIO_MODE3_SET                0xbe
+#define MT6357_GPIO_MODE3_CLR                0xc0
+#define MT6357_GPIO_RSV                      0xc2
+#define MT6357_TOP2_ID                       0x100
+#define MT6357_TOP2_REV0                     0x102
+#define MT6357_TOP2_DSN_DBI                  0x104
+#define MT6357_TOP2_DSN_DXI                  0x106
+#define MT6357_TOP_PAM0                      0x108
+#define MT6357_TOP_PAM1                      0x10a
+#define MT6357_TOP_CKPDN_CON0                0x10c
+#define MT6357_TOP_CKPDN_CON0_SET            0x10e
+#define MT6357_TOP_CKPDN_CON0_CLR            0x110
+#define MT6357_TOP_CKPDN_CON1                0x112
+#define MT6357_TOP_CKPDN_CON1_SET            0x114
+#define MT6357_TOP_CKPDN_CON1_CLR            0x116
+#define MT6357_TOP_CKSEL_CON0                0x118
+#define MT6357_TOP_CKSEL_CON0_SET            0x11a
+#define MT6357_TOP_CKSEL_CON0_CLR            0x11c
+#define MT6357_TOP_CKSEL_CON1                0x11e
+#define MT6357_TOP_CKSEL_CON1_SET            0x120
+#define MT6357_TOP_CKSEL_CON1_CLR            0x122
+#define MT6357_TOP_CKDIVSEL_CON0             0x124
+#define MT6357_TOP_CKDIVSEL_CON0_SET         0x126
+#define MT6357_TOP_CKDIVSEL_CON0_CLR         0x128
+#define MT6357_TOP_CKHWEN_CON0               0x12a
+#define MT6357_TOP_CKHWEN_CON0_SET           0x12c
+#define MT6357_TOP_CKHWEN_CON0_CLR           0x12e
+#define MT6357_TOP_CKTST_CON0                0x130
+#define MT6357_TOP_CKTST_CON1                0x132
+#define MT6357_TOP_CLK_CON0                  0x134
+#define MT6357_TOP_CLK_CON0_SET              0x136
+#define MT6357_TOP_CLK_CON0_CLR              0x138
+#define MT6357_TOP_DCM_CON0                  0x13a
+#define MT6357_TOP_HANDOVER_DEBUG0           0x13c
+#define MT6357_TOP_RST_CON0                  0x13e
+#define MT6357_TOP_RST_CON0_SET              0x140
+#define MT6357_TOP_RST_CON0_CLR              0x142
+#define MT6357_TOP_RST_CON1                  0x144
+#define MT6357_TOP_RST_CON1_SET              0x146
+#define MT6357_TOP_RST_CON1_CLR              0x148
+#define MT6357_TOP_RST_CON2                  0x14a
+#define MT6357_TOP_RST_MISC                  0x14c
+#define MT6357_TOP_RST_MISC_SET              0x14e
+#define MT6357_TOP_RST_MISC_CLR              0x150
+#define MT6357_TOP_RST_STATUS                0x152
+#define MT6357_TOP_RST_STATUS_SET            0x154
+#define MT6357_TOP_RST_STATUS_CLR            0x156
+#define MT6357_TOP2_ELR_NUM                  0x158
+#define MT6357_TOP2_ELR0                     0x15a
+#define MT6357_TOP2_ELR1                     0x15c
+#define MT6357_TOP3_ID                       0x180
+#define MT6357_TOP3_REV0                     0x182
+#define MT6357_TOP3_DSN_DBI                  0x184
+#define MT6357_TOP3_DSN_DXI                  0x186
+#define MT6357_MISC_TOP_INT_CON0             0x188
+#define MT6357_MISC_TOP_INT_CON0_SET         0x18a
+#define MT6357_MISC_TOP_INT_CON0_CLR         0x18c
+#define MT6357_MISC_TOP_INT_MASK_CON0        0x18e
+#define MT6357_MISC_TOP_INT_MASK_CON0_SET    0x190
+#define MT6357_MISC_TOP_INT_MASK_CON0_CLR    0x192
+#define MT6357_MISC_TOP_INT_STATUS0          0x194
+#define MT6357_MISC_TOP_INT_RAW_STATUS0      0x196
+#define MT6357_TOP_INT_MASK_CON0             0x198
+#define MT6357_TOP_INT_MASK_CON0_SET         0x19a
+#define MT6357_TOP_INT_MASK_CON0_CLR         0x19c
+#define MT6357_TOP_INT_STATUS0               0x19e
+#define MT6357_TOP_INT_RAW_STATUS0           0x1a0
+#define MT6357_TOP_INT_CON0                  0x1a2
+#define MT6357_PLT0_ID                       0x380
+#define MT6357_PLT0_REV0                     0x382
+#define MT6357_PLT0_REV1                     0x384
+#define MT6357_PLT0_DSN_DXI                  0x386
+#define MT6357_FQMTR_CON0                    0x388
+#define MT6357_FQMTR_CON1                    0x38a
+#define MT6357_FQMTR_CON2                    0x38c
+#define MT6357_TOP_CLK_TRIM                  0x38e
+#define MT6357_OTP_CON0                      0x390
+#define MT6357_OTP_CON1                      0x392
+#define MT6357_OTP_CON2                      0x394
+#define MT6357_OTP_CON3                      0x396
+#define MT6357_OTP_CON4                      0x398
+#define MT6357_OTP_CON5                      0x39a
+#define MT6357_OTP_CON6                      0x39c
+#define MT6357_OTP_CON7                      0x39e
+#define MT6357_OTP_CON8                      0x3a0
+#define MT6357_OTP_CON9                      0x3a2
+#define MT6357_OTP_CON10                     0x3a4
+#define MT6357_OTP_CON11                     0x3a6
+#define MT6357_OTP_CON12                     0x3a8
+#define MT6357_OTP_CON13                     0x3aa
+#define MT6357_OTP_CON14                     0x3ac
+#define MT6357_TOP_TMA_KEY                   0x3ae
+#define MT6357_TOP_MDB_CONF0                 0x3b0
+#define MT6357_TOP_MDB_CONF1                 0x3b2
+#define MT6357_TOP_MDB_CONF2                 0x3b4
+#define MT6357_PLT0_ELR_NUM                  0x3b6
+#define MT6357_PLT0_ELR0                     0x3b8
+#define MT6357_PLT0_ELR1                     0x3ba
+#define MT6357_SPISLV_ID                     0x400
+#define MT6357_SPISLV_REV0                   0x402
+#define MT6357_SPISLV_REV1                   0x404
+#define MT6357_SPISLV_DSN_DXI                0x406
+#define MT6357_RG_SPI_CON0                   0x408
+#define MT6357_DEW_DIO_EN                    0x40a
+#define MT6357_DEW_READ_TEST                 0x40c
+#define MT6357_DEW_WRITE_TEST                0x40e
+#define MT6357_DEW_CRC_SWRST                 0x410
+#define MT6357_DEW_CRC_EN                    0x412
+#define MT6357_DEW_CRC_VAL                   0x414
+#define MT6357_DEW_DBG_MON_SEL               0x416
+#define MT6357_DEW_CIPHER_KEY_SEL            0x418
+#define MT6357_DEW_CIPHER_IV_SEL             0x41a
+#define MT6357_DEW_CIPHER_EN                 0x41c
+#define MT6357_DEW_CIPHER_RDY                0x41e
+#define MT6357_DEW_CIPHER_MODE               0x420
+#define MT6357_DEW_CIPHER_SWRST              0x422
+#define MT6357_DEW_RDDMY_NO                  0x424
+#define MT6357_INT_TYPE_CON0                 0x426
+#define MT6357_INT_TYPE_CON0_SET             0x428
+#define MT6357_INT_TYPE_CON0_CLR             0x42a
+#define MT6357_INT_STA                       0x42c
+#define MT6357_RG_SPI_CON1                   0x42e
+#define MT6357_RG_SPI_CON2                   0x430
+#define MT6357_RG_SPI_CON3                   0x432
+#define MT6357_RG_SPI_CON4                   0x434
+#define MT6357_RG_SPI_CON5                   0x436
+#define MT6357_RG_SPI_CON6                   0x438
+#define MT6357_RG_SPI_CON7                   0x43a
+#define MT6357_RG_SPI_CON8                   0x43c
+#define MT6357_RG_SPI_CON9                   0x43e
+#define MT6357_RG_SPI_CON10                  0x440
+#define MT6357_RG_SPI_CON11                  0x442
+#define MT6357_RG_SPI_CON12                  0x444
+#define MT6357_RG_SPI_CON13                  0x446
+#define MT6357_TOP_SPI_CON0                  0x448
+#define MT6357_TOP_SPI_CON1                  0x44a
+#define MT6357_SCK_TOP_DSN_ID                0x500
+#define MT6357_SCK_TOP_DSN_REV0              0x502
+#define MT6357_SCK_TOP_DBI                   0x504
+#define MT6357_SCK_TOP_DXI                   0x506
+#define MT6357_SCK_TOP_TPM0                  0x508
+#define MT6357_SCK_TOP_TPM1                  0x50a
+#define MT6357_SCK_TOP_CON0                  0x50c
+#define MT6357_SCK_TOP_CON1                  0x50e
+#define MT6357_SCK_TOP_TEST_OUT              0x510
+#define MT6357_SCK_TOP_TEST_CON0             0x512
+#define MT6357_SCK_TOP_CKPDN_CON0            0x514
+#define MT6357_SCK_TOP_CKPDN_CON0_SET        0x516
+#define MT6357_SCK_TOP_CKPDN_CON0_CLR        0x518
+#define MT6357_SCK_TOP_CKHWEN_CON0           0x51a
+#define MT6357_SCK_TOP_CKHWEN_CON0_SET       0x51c
+#define MT6357_SCK_TOP_CKHWEN_CON0_CLR       0x51e
+#define MT6357_SCK_TOP_CKTST_CON             0x520
+#define MT6357_SCK_TOP_RST_CON0              0x522
+#define MT6357_SCK_TOP_RST_CON0_SET          0x524
+#define MT6357_SCK_TOP_RST_CON0_CLR          0x526
+#define MT6357_SCK_TOP_INT_CON0              0x528
+#define MT6357_SCK_TOP_INT_CON0_SET          0x52a
+#define MT6357_SCK_TOP_INT_CON0_CLR          0x52c
+#define MT6357_SCK_TOP_INT_MASK_CON0         0x52e
+#define MT6357_SCK_TOP_INT_MASK_CON0_SET     0x530
+#define MT6357_SCK_TOP_INT_MASK_CON0_CLR     0x532
+#define MT6357_SCK_TOP_INT_STATUS0           0x534
+#define MT6357_SCK_TOP_INT_RAW_STATUS0       0x536
+#define MT6357_SCK_TOP_INT_MISC_CON          0x538
+#define MT6357_EOSC_CALI_CON0                0x53a
+#define MT6357_EOSC_CALI_CON1                0x53c
+#define MT6357_RTC_MIX_CON0                  0x53e
+#define MT6357_RTC_MIX_CON1                  0x540
+#define MT6357_RTC_MIX_CON2                  0x542
+#define MT6357_RTC_DSN_ID                    0x580
+#define MT6357_RTC_DSN_REV0                  0x582
+#define MT6357_RTC_DBI                       0x584
+#define MT6357_RTC_DXI                       0x586
+#define MT6357_RTC_BBPU                      0x588
+#define MT6357_RTC_IRQ_STA                   0x58a
+#define MT6357_RTC_IRQ_EN                    0x58c
+#define MT6357_RTC_CII_EN                    0x58e
+#define MT6357_RTC_AL_MASK                   0x590
+#define MT6357_RTC_TC_SEC                    0x592
+#define MT6357_RTC_TC_MIN                    0x594
+#define MT6357_RTC_TC_HOU                    0x596
+#define MT6357_RTC_TC_DOM                    0x598
+#define MT6357_RTC_TC_DOW                    0x59a
+#define MT6357_RTC_TC_MTH                    0x59c
+#define MT6357_RTC_TC_YEA                    0x59e
+#define MT6357_RTC_AL_SEC                    0x5a0
+#define MT6357_RTC_AL_MIN                    0x5a2
+#define MT6357_RTC_AL_HOU                    0x5a4
+#define MT6357_RTC_AL_DOM                    0x5a6
+#define MT6357_RTC_AL_DOW                    0x5a8
+#define MT6357_RTC_AL_MTH                    0x5aa
+#define MT6357_RTC_AL_YEA                    0x5ac
+#define MT6357_RTC_OSC32CON                  0x5ae
+#define MT6357_RTC_POWERKEY1                 0x5b0
+#define MT6357_RTC_POWERKEY2                 0x5b2
+#define MT6357_RTC_PDN1                      0x5b4
+#define MT6357_RTC_PDN2                      0x5b6
+#define MT6357_RTC_SPAR0                     0x5b8
+#define MT6357_RTC_SPAR1                     0x5ba
+#define MT6357_RTC_PROT                      0x5bc
+#define MT6357_RTC_DIFF                      0x5be
+#define MT6357_RTC_CALI                      0x5c0
+#define MT6357_RTC_WRTGR                     0x5c2
+#define MT6357_RTC_CON                       0x5c4
+#define MT6357_RTC_SEC_CTRL                  0x5c6
+#define MT6357_RTC_INT_CNT                   0x5c8
+#define MT6357_RTC_SEC_DAT0                  0x5ca
+#define MT6357_RTC_SEC_DAT1                  0x5cc
+#define MT6357_RTC_SEC_DAT2                  0x5ce
+#define MT6357_RTC_SEC_DSN_ID                0x600
+#define MT6357_RTC_SEC_DSN_REV0              0x602
+#define MT6357_RTC_SEC_DBI                   0x604
+#define MT6357_RTC_SEC_DXI                   0x606
+#define MT6357_RTC_TC_SEC_SEC                0x608
+#define MT6357_RTC_TC_MIN_SEC                0x60a
+#define MT6357_RTC_TC_HOU_SEC                0x60c
+#define MT6357_RTC_TC_DOM_SEC                0x60e
+#define MT6357_RTC_TC_DOW_SEC                0x610
+#define MT6357_RTC_TC_MTH_SEC                0x612
+#define MT6357_RTC_TC_YEA_SEC                0x614
+#define MT6357_RTC_SEC_CK_PDN                0x616
+#define MT6357_RTC_SEC_WRTGR                 0x618
+#define MT6357_DCXO_DSN_ID                   0x780
+#define MT6357_DCXO_DSN_REV0                 0x782
+#define MT6357_DCXO_DSN_DBI                  0x784
+#define MT6357_DCXO_DSN_DXI                  0x786
+#define MT6357_DCXO_CW00                     0x788
+#define MT6357_DCXO_CW00_SET                 0x78a
+#define MT6357_DCXO_CW00_CLR                 0x78c
+#define MT6357_DCXO_CW01                     0x78e
+#define MT6357_DCXO_CW02                     0x790
+#define MT6357_DCXO_CW03                     0x792
+#define MT6357_DCXO_CW04                     0x794
+#define MT6357_DCXO_CW05                     0x796
+#define MT6357_DCXO_CW06                     0x798
+#define MT6357_DCXO_CW07                     0x79a
+#define MT6357_DCXO_CW08                     0x79c
+#define MT6357_DCXO_CW09                     0x79e
+#define MT6357_DCXO_CW10                     0x7a0
+#define MT6357_DCXO_CW11                     0x7a2
+#define MT6357_DCXO_CW11_SET                 0x7a4
+#define MT6357_DCXO_CW11_CLR                 0x7a6
+#define MT6357_DCXO_CW12                     0x7a8
+#define MT6357_DCXO_CW13                     0x7aa
+#define MT6357_DCXO_CW14                     0x7ac
+#define MT6357_DCXO_CW15                     0x7ae
+#define MT6357_DCXO_CW16                     0x7b0
+#define MT6357_DCXO_CW17                     0x7b2
+#define MT6357_DCXO_CW18                     0x7b4
+#define MT6357_DCXO_CW19                     0x7b6
+#define MT6357_DCXO_CW20                     0x7b8
+#define MT6357_DCXO_CW21                     0x7ba
+#define MT6357_DCXO_CW22                     0x7bc
+#define MT6357_DCXO_ELR_NUM                  0x7be
+#define MT6357_DCXO_ELR0                     0x7c0
+#define MT6357_PSC_TOP_ID                    0x900
+#define MT6357_PSC_TOP_REV0                  0x902
+#define MT6357_PSC_TOP_DBI                   0x904
+#define MT6357_PSC_TOP_DXI                   0x906
+#define MT6357_PSC_TPM0                      0x908
+#define MT6357_PSC_TPM1                      0x90a
+#define MT6357_PSC_TOP_RSTCTL_0              0x90c
+#define MT6357_PSC_TOP_INT_CON0              0x90e
+#define MT6357_PSC_TOP_INT_CON0_SET          0x910
+#define MT6357_PSC_TOP_INT_CON0_CLR          0x912
+#define MT6357_PSC_TOP_INT_MASK_CON0         0x914
+#define MT6357_PSC_TOP_INT_MASK_CON0_SET     0x916
+#define MT6357_PSC_TOP_INT_MASK_CON0_CLR     0x918
+#define MT6357_PSC_TOP_INT_STATUS0           0x91a
+#define MT6357_PSC_TOP_INT_RAW_STATUS0       0x91c
+#define MT6357_PSC_TOP_INT_MISC_CON          0x91e
+#define MT6357_PSC_TOP_INT_MISC_CON_SET      0x920
+#define MT6357_PSC_TOP_INT_MISC_CON_CLR      0x922
+#define MT6357_PSC_TOP_MON_CTL               0x924
+#define MT6357_STRUP_ID                      0x980
+#define MT6357_STRUP_REV0                    0x982
+#define MT6357_STRUP_DBI                     0x984
+#define MT6357_STRUP_DXI                     0x986
+#define MT6357_STRUP_ANA_CON0                0x988
+#define MT6357_STRUP_ANA_CON1                0x98a
+#define MT6357_STRUP_ANA_CON2                0x98c
+#define MT6357_STRUP_ELR_NUM                 0x98e
+#define MT6357_STRUP_ELR_0                   0x990
+#define MT6357_PSEQ_ID                       0xa00
+#define MT6357_PSEQ_REV0                     0xa02
+#define MT6357_PSEQ_DBI                      0xa04
+#define MT6357_PSEQ_DXI                      0xa06
+#define MT6357_PPCCTL0                       0xa08
+#define MT6357_PPCCTL1                       0xa0a
+#define MT6357_PPCCTL2                       0xa0c
+#define MT6357_PPCCFG0                       0xa0e
+#define MT6357_PPCTST0                       0xa10
+#define MT6357_PORFLAG                       0xa12
+#define MT6357_STRUP_CON0                    0xa14
+#define MT6357_STRUP_CON1                    0xa16
+#define MT6357_STRUP_CON2                    0xa18
+#define MT6357_STRUP_CON3                    0xa1a
+#define MT6357_STRUP_CON4                    0xa1c
+#define MT6357_STRUP_CON5                    0xa1e
+#define MT6357_STRUP_CON6                    0xa20
+#define MT6357_STRUP_CON7                    0xa22
+#define MT6357_CPSCFG0                       0xa24
+#define MT6357_STRUP_CON9                    0xa26
+#define MT6357_STRUP_CON10                   0xa28
+#define MT6357_STRUP_CON11                   0xa2a
+#define MT6357_STRUP_CON12                   0xa2c
+#define MT6357_STRUP_CON13                   0xa2e
+#define MT6357_STRUP_CON14                   0xa30
+#define MT6357_STRUP_CON15                   0xa32
+#define MT6357_STRUP_CON16                   0xa34
+#define MT6357_STRUP_CON19                   0xa36
+#define MT6357_PSEQ_ELR_NUM                  0xa38
+#define MT6357_PSEQ_ELR7                     0xa3a
+#define MT6357_PSEQ_ELR8                     0xa3c
+#define MT6357_PCHR_DIG_DSN_ID               0xa80
+#define MT6357_PCHR_DIG_DSN_REV0             0xa82
+#define MT6357_PCHR_DIG_DSN_DBI              0xa84
+#define MT6357_PCHR_DIG_DSN_DXI              0xa86
+#define MT6357_CHR_TOP_CON0                  0xa88
+#define MT6357_CHR_TOP_CON1                  0xa8a
+#define MT6357_CHR_TOP_CON2                  0xa8c
+#define MT6357_CHR_TOP_CON3                  0xa8e
+#define MT6357_CHR_TOP_CON4                  0xa90
+#define MT6357_CHR_TOP_CON5                  0xa92
+#define MT6357_CHR_TOP_CON6                  0xa94
+#define MT6357_PCHR_DIG_ELR_NUM              0xa96
+#define MT6357_PCHR_ELR0                     0xa98
+#define MT6357_PCHR_ELR1                     0xa9a
+#define MT6357_PCHR_MACRO_DSN_ID             0xb80
+#define MT6357_PCHR_MACRO_DSN_REV0           0xb82
+#define MT6357_PCHR_MACRO_DSN_DBI            0xb84
+#define MT6357_PCHR_MACRO_DSN_DXI            0xb86
+#define MT6357_CHR_CON0                      0xb88
+#define MT6357_CHR_CON1                      0xb8a
+#define MT6357_CHR_CON2                      0xb8c
+#define MT6357_CHR_CON3                      0xb8e
+#define MT6357_CHR_CON4                      0xb90
+#define MT6357_CHR_CON5                      0xb92
+#define MT6357_CHR_CON6                      0xb94
+#define MT6357_CHR_CON7                      0xb96
+#define MT6357_CHR_CON8                      0xb98
+#define MT6357_CHR_CON9                      0xb9a
+#define MT6357_BM_TOP_DSN_ID                 0xc00
+#define MT6357_BM_TOP_DSN_REV0               0xc02
+#define MT6357_BM_TOP_DBI                    0xc04
+#define MT6357_BM_TOP_DXI                    0xc06
+#define MT6357_BM_TPM0                       0xc08
+#define MT6357_BM_TPM1                       0xc0a
+#define MT6357_BM_TOP_CKPDN_CON0             0xc0c
+#define MT6357_BM_TOP_CKPDN_CON0_SET         0xc0e
+#define MT6357_BM_TOP_CKPDN_CON0_CLR         0xc10
+#define MT6357_BM_TOP_CKSEL_CON0             0xc12
+#define MT6357_BM_TOP_CKSEL_CON0_SET         0xc14
+#define MT6357_BM_TOP_CKSEL_CON0_CLR         0xc16
+#define MT6357_BM_TOP_CKTST_CON0             0xc18
+#define MT6357_BM_TOP_RST_CON0               0xc1a
+#define MT6357_BM_TOP_RST_CON0_SET           0xc1c
+#define MT6357_BM_TOP_RST_CON0_CLR           0xc1e
+#define MT6357_BM_TOP_INT_CON0               0xc20
+#define MT6357_BM_TOP_INT_CON0_SET           0xc22
+#define MT6357_BM_TOP_INT_CON0_CLR           0xc24
+#define MT6357_BM_TOP_INT_CON1               0xc26
+#define MT6357_BM_TOP_INT_CON1_SET           0xc28
+#define MT6357_BM_TOP_INT_CON1_CLR           0xc2a
+#define MT6357_BM_TOP_INT_MASK_CON0          0xc2c
+#define MT6357_BM_TOP_INT_MASK_CON0_SET      0xc2e
+#define MT6357_BM_TOP_INT_MASK_CON0_CLR      0xc30
+#define MT6357_BM_TOP_INT_MASK_CON1          0xc32
+#define MT6357_BM_TOP_INT_MASK_CON1_SET      0xc34
+#define MT6357_BM_TOP_INT_MASK_CON1_CLR      0xc36
+#define MT6357_BM_TOP_INT_STATUS0            0xc38
+#define MT6357_BM_TOP_INT_STATUS1            0xc3a
+#define MT6357_BM_TOP_INT_RAW_STATUS0        0xc3c
+#define MT6357_BM_TOP_INT_RAW_STATUS1        0xc3e
+#define MT6357_BM_TOP_INT_MISC_CON           0xc40
+#define MT6357_BM_TOP_DBG_CON                0xc42
+#define MT6357_BM_TOP_RSV0                   0xc44
+#define MT6357_FGADC_ANA_DSN_ID              0xc80
+#define MT6357_FGADC_ANA_DSN_REV0            0xc82
+#define MT6357_FGADC_ANA_DSN_DBI             0xc84
+#define MT6357_FGADC_ANA_DSN_DXI             0xc86
+#define MT6357_FGADC_ANA_CON0                0xc88
+#define MT6357_FGADC_ANA_TEST_CON0           0xc8a
+#define MT6357_FGADC_ANA_ELR_NUM             0xc8c
+#define MT6357_FGADC_ANA_ELR0                0xc8e
+#define MT6357_FGADC_ANA_ELR1                0xc90
+#define MT6357_FGADC0_DSN_ID                 0xd00
+#define MT6357_FGADC0_DSN_REV0               0xd02
+#define MT6357_FGADC0_DSN_DBI                0xd04
+#define MT6357_FGADC0_DSN_DXI                0xd06
+#define MT6357_FGADC_CON0                    0xd08
+#define MT6357_FGADC_CON1                    0xd0a
+#define MT6357_FGADC_CON2                    0xd0c
+#define MT6357_FGADC_CON3                    0xd0e
+#define MT6357_FGADC_CON4                    0xd10
+#define MT6357_FGADC_CAR_CON0                0xd12
+#define MT6357_FGADC_CAR_CON1                0xd14
+#define MT6357_FGADC_CAR_CON2                0xd16
+#define MT6357_FGADC_CARTH_CON0              0xd18
+#define MT6357_FGADC_CARTH_CON1              0xd1a
+#define MT6357_FGADC_CARTH_CON2              0xd1c
+#define MT6357_FGADC_CARTH_CON3              0xd1e
+#define MT6357_FGADC_NTER_CON0               0xd20
+#define MT6357_FGADC_NTER_CON1               0xd22
+#define MT6357_FGADC_NTER_CON2               0xd24
+#define MT6357_FGADC_SON_CON0                0xd26
+#define MT6357_FGADC_SON_CON1                0xd28
+#define MT6357_FGADC_SON_CON2                0xd2a
+#define MT6357_FGADC_SON_CON3                0xd2c
+#define MT6357_FGADC_ZCV_CON0                0xd2e
+#define MT6357_FGADC_ZCV_CON1                0xd30
+#define MT6357_FGADC_ZCV_CON2                0xd32
+#define MT6357_FGADC_ZCV_CON3                0xd34
+#define MT6357_FGADC_ZCV_CON4                0xd36
+#define MT6357_FGADC_ZCVTH_CON0              0xd38
+#define MT6357_FGADC_ZCVTH_CON1              0xd3a
+#define MT6357_FGADC_ZCVTH_CON2              0xd3c
+#define MT6357_FGADC1_DSN_ID                 0xd80
+#define MT6357_FGADC1_DSN_REV0               0xd82
+#define MT6357_FGADC1_DSN_DBI                0xd84
+#define MT6357_FGADC1_DSN_DXI                0xd86
+#define MT6357_FGADC_R_CON0                  0xd88
+#define MT6357_FGADC_CUR_CON0                0xd8a
+#define MT6357_FGADC_CUR_CON1                0xd8c
+#define MT6357_FGADC_CUR_CON2                0xd8e
+#define MT6357_FGADC_CUR_CON3                0xd90
+#define MT6357_FGADC_OFFSET_CON0             0xd92
+#define MT6357_FGADC_OFFSET_CON1             0xd94
+#define MT6357_FGADC_GAIN_CON0               0xd96
+#define MT6357_FGADC_TEST_CON0               0xd98
+#define MT6357_SYSTEM_INFO_CON0              0xd9a
+#define MT6357_SYSTEM_INFO_CON1              0xd9c
+#define MT6357_SYSTEM_INFO_CON2              0xd9e
+#define MT6357_SYSTEM_INFO_CON3              0xda0
+#define MT6357_SYSTEM_INFO_CON4              0xda2
+#define MT6357_BATON_ANA_DSN_ID              0xe00
+#define MT6357_BATON_ANA_DSN_REV0            0xe02
+#define MT6357_BATON_ANA_DSN_DBI             0xe04
+#define MT6357_BATON_ANA_DSN_DXI             0xe06
+#define MT6357_BATON_ANA_CON0                0xe08
+#define MT6357_BATON_ANA_ELR_NUM             0xe0a
+#define MT6357_BATON_ANA_ELR0                0xe0c
+#define MT6357_HK_TOP_ID                     0xf80
+#define MT6357_HK_TOP_REV0                   0xf82
+#define MT6357_HK_TOP_DBI                    0xf84
+#define MT6357_HK_TOP_DXI                    0xf86
+#define MT6357_HK_TPM0                       0xf88
+#define MT6357_HK_TPM1                       0xf8a
+#define MT6357_HK_TOP_CLK_CON0               0xf8c
+#define MT6357_HK_TOP_CLK_CON1               0xf8e
+#define MT6357_HK_TOP_RST_CON0               0xf90
+#define MT6357_HK_TOP_INT_CON0               0xf92
+#define MT6357_HK_TOP_INT_CON0_SET           0xf94
+#define MT6357_HK_TOP_INT_CON0_CLR           0xf96
+#define MT6357_HK_TOP_INT_MASK_CON0          0xf98
+#define MT6357_HK_TOP_INT_MASK_CON0_SET      0xf9a
+#define MT6357_HK_TOP_INT_MASK_CON0_CLR      0xf9c
+#define MT6357_HK_TOP_INT_STATUS0            0xf9e
+#define MT6357_HK_TOP_INT_RAW_STATUS0        0xfa0
+#define MT6357_HK_TOP_MON_CON0               0xfa2
+#define MT6357_HK_TOP_MON_CON1               0xfa4
+#define MT6357_HK_TOP_MON_CON2               0xfa6
+#define MT6357_AUXADC_DSN_ID                 0x1000
+#define MT6357_AUXADC_DSN_REV0               0x1002
+#define MT6357_AUXADC_DSN_DBI                0x1004
+#define MT6357_AUXADC_DSN_DXI                0x1006
+#define MT6357_AUXADC_ANA_CON0               0x1008
+#define MT6357_AUXADC_DIG_1_DSN_ID           0x1080
+#define MT6357_AUXADC_DIG_1_DSN_REV0         0x1082
+#define MT6357_AUXADC_DIG_1_DSN_DBI          0x1084
+#define MT6357_AUXADC_DIG_1_DSN_DXI          0x1086
+#define MT6357_AUXADC_ADC0                   0x1088
+#define MT6357_AUXADC_ADC1                   0x108a
+#define MT6357_AUXADC_ADC2                   0x108c
+#define MT6357_AUXADC_ADC3                   0x108e
+#define MT6357_AUXADC_ADC4                   0x1090
+#define MT6357_AUXADC_ADC5                   0x1092
+#define MT6357_AUXADC_ADC6                   0x1094
+#define MT6357_AUXADC_ADC7                   0x1096
+#define MT6357_AUXADC_ADC8                   0x1098
+#define MT6357_AUXADC_ADC9                   0x109a
+#define MT6357_AUXADC_ADC10                  0x109c
+#define MT6357_AUXADC_ADC11                  0x109e
+#define MT6357_AUXADC_ADC12                  0x10a0
+#define MT6357_AUXADC_ADC14                  0x10a2
+#define MT6357_AUXADC_ADC16                  0x10a4
+#define MT6357_AUXADC_ADC17                  0x10a6
+#define MT6357_AUXADC_ADC18                  0x10a8
+#define MT6357_AUXADC_ADC19                  0x10aa
+#define MT6357_AUXADC_ADC20                  0x10ac
+#define MT6357_AUXADC_ADC21                  0x10ae
+#define MT6357_AUXADC_ADC22                  0x10b0
+#define MT6357_AUXADC_ADC23                  0x10b2
+#define MT6357_AUXADC_ADC24                  0x10b4
+#define MT6357_AUXADC_ADC25                  0x10b6
+#define MT6357_AUXADC_ADC26                  0x10b8
+#define MT6357_AUXADC_ADC27                  0x10ba
+#define MT6357_AUXADC_ADC29                  0x10bc
+#define MT6357_AUXADC_ADC30                  0x10be
+#define MT6357_AUXADC_ADC31                  0x10c0
+#define MT6357_AUXADC_ADC32                  0x10c2
+#define MT6357_AUXADC_ADC33                  0x10c4
+#define MT6357_AUXADC_ADC34                  0x10c6
+#define MT6357_AUXADC_ADC35                  0x10c8
+#define MT6357_AUXADC_ADC36                  0x10ca
+#define MT6357_AUXADC_ADC38                  0x10cc
+#define MT6357_AUXADC_ADC39                  0x10ce
+#define MT6357_AUXADC_ADC40                  0x10d0
+#define MT6357_AUXADC_ADC41                  0x10d2
+#define MT6357_AUXADC_ADC42                  0x10d4
+#define MT6357_AUXADC_ADC43                  0x10d6
+#define MT6357_AUXADC_ADC46                  0x10d8
+#define MT6357_AUXADC_ADC47                  0x10da
+#define MT6357_AUXADC_DIG_1_ELR_NUM          0x10dc
+#define MT6357_AUXADC_DIG_1_ELR0             0x10de
+#define MT6357_AUXADC_DIG_1_ELR1             0x10e0
+#define MT6357_AUXADC_DIG_2_DSN_ID           0x1100
+#define MT6357_AUXADC_DIG_2_DSN_REV0         0x1102
+#define MT6357_AUXADC_DIG_2_DSN_DBI          0x1104
+#define MT6357_AUXADC_DIG_2_DSN_DXI          0x1106
+#define MT6357_AUXADC_STA0                   0x1108
+#define MT6357_AUXADC_STA1                   0x110a
+#define MT6357_AUXADC_STA2                   0x110c
+#define MT6357_AUXADC_RQST0                  0x110e
+#define MT6357_AUXADC_RQST0_SET              0x1110
+#define MT6357_AUXADC_RQST0_CLR              0x1112
+#define MT6357_AUXADC_RQST2                  0x1114
+#define MT6357_AUXADC_RQST2_SET              0x1116
+#define MT6357_AUXADC_RQST2_CLR              0x1118
+#define MT6357_AUXADC_RQST1                  0x111a
+#define MT6357_AUXADC_RQST1_SET              0x111c
+#define MT6357_AUXADC_RQST1_CLR              0x111e
+#define MT6357_AUXADC_CON0                   0x1120
+#define MT6357_AUXADC_CON0_SET               0x1122
+#define MT6357_AUXADC_CON0_CLR               0x1124
+#define MT6357_AUXADC_CON1                   0x1126
+#define MT6357_AUXADC_CON2                   0x1128
+#define MT6357_AUXADC_CON3                   0x112a
+#define MT6357_AUXADC_CON4                   0x112c
+#define MT6357_AUXADC_CON5                   0x112e
+#define MT6357_AUXADC_CON6                   0x1130
+#define MT6357_AUXADC_CON7                   0x1132
+#define MT6357_AUXADC_CON8                   0x1134
+#define MT6357_AUXADC_CON9                   0x1136
+#define MT6357_AUXADC_CON10                  0x1138
+#define MT6357_AUXADC_CON11                  0x113a
+#define MT6357_AUXADC_CON12                  0x113c
+#define MT6357_AUXADC_CON13                  0x113e
+#define MT6357_AUXADC_CON14                  0x1140
+#define MT6357_AUXADC_CON15                  0x1142
+#define MT6357_AUXADC_CON16                  0x1144
+#define MT6357_AUXADC_CON17                  0x1146
+#define MT6357_AUXADC_CON18                  0x1148
+#define MT6357_AUXADC_CON19                  0x114a
+#define MT6357_AUXADC_CON20                  0x114c
+#define MT6357_AUXADC_DIG_3_DSN_ID           0x1180
+#define MT6357_AUXADC_DIG_3_DSN_REV0         0x1182
+#define MT6357_AUXADC_DIG_3_DSN_DBI          0x1184
+#define MT6357_AUXADC_DIG_3_DSN_DXI          0x1186
+#define MT6357_AUXADC_AUTORPT0               0x1188
+#define MT6357_AUXADC_LBAT0                  0x118a
+#define MT6357_AUXADC_LBAT1                  0x118c
+#define MT6357_AUXADC_LBAT2                  0x118e
+#define MT6357_AUXADC_LBAT3                  0x1190
+#define MT6357_AUXADC_LBAT4                  0x1192
+#define MT6357_AUXADC_LBAT5                  0x1194
+#define MT6357_AUXADC_LBAT6                  0x1196
+#define MT6357_AUXADC_ACCDET                 0x1198
+#define MT6357_AUXADC_DBG0                   0x119a
+#define MT6357_AUXADC_IMP0                   0x119c
+#define MT6357_AUXADC_IMP1                   0x119e
+#define MT6357_AUXADC_DIG_3_ELR_NUM          0x11a0
+#define MT6357_AUXADC_DIG_3_ELR0             0x11a2
+#define MT6357_AUXADC_DIG_3_ELR1             0x11a4
+#define MT6357_AUXADC_DIG_3_ELR2             0x11a6
+#define MT6357_AUXADC_DIG_3_ELR3             0x11a8
+#define MT6357_AUXADC_DIG_3_ELR4             0x11aa
+#define MT6357_AUXADC_DIG_3_ELR5             0x11ac
+#define MT6357_AUXADC_DIG_3_ELR6             0x11ae
+#define MT6357_AUXADC_DIG_3_ELR7             0x11b0
+#define MT6357_AUXADC_DIG_3_ELR8             0x11b2
+#define MT6357_AUXADC_DIG_3_ELR9             0x11b4
+#define MT6357_AUXADC_DIG_3_ELR10            0x11b6
+#define MT6357_AUXADC_DIG_3_ELR11            0x11b8
+#define MT6357_AUXADC_DIG_4_DSN_ID           0x1200
+#define MT6357_AUXADC_DIG_4_DSN_REV0         0x1202
+#define MT6357_AUXADC_DIG_4_DSN_DBI          0x1204
+#define MT6357_AUXADC_DIG_4_DSN_DXI          0x1206
+#define MT6357_AUXADC_MDRT_0                 0x1208
+#define MT6357_AUXADC_MDRT_1                 0x120a
+#define MT6357_AUXADC_MDRT_2                 0x120c
+#define MT6357_AUXADC_MDRT_3                 0x120e
+#define MT6357_AUXADC_MDRT_4                 0x1210
+#define MT6357_AUXADC_DCXO_MDRT_0            0x1212
+#define MT6357_AUXADC_DCXO_MDRT_1            0x1214
+#define MT6357_AUXADC_DCXO_MDRT_2            0x1216
+#define MT6357_AUXADC_NAG_0                  0x1218
+#define MT6357_AUXADC_NAG_1                  0x121a
+#define MT6357_AUXADC_NAG_2                  0x121c
+#define MT6357_AUXADC_NAG_3                  0x121e
+#define MT6357_AUXADC_NAG_4                  0x1220
+#define MT6357_AUXADC_NAG_5                  0x1222
+#define MT6357_AUXADC_NAG_6                  0x1224
+#define MT6357_AUXADC_NAG_7                  0x1226
+#define MT6357_AUXADC_NAG_8                  0x1228
+#define MT6357_AUXADC_RSV_1                  0x122a
+#define MT6357_AUXADC_ANA_0                  0x122c
+#define MT6357_AUXADC_IMP_CG0                0x122e
+#define MT6357_AUXADC_LBAT_CG0               0x1230
+#define MT6357_AUXADC_NAG_CG0                0x1232
+#define MT6357_AUXADC_PRI_NEW                0x1234
+#define MT6357_AUXADC_CHR_TOP_CON2           0x1236
+#define MT6357_BUCK_TOP_DSN_ID               0x1400
+#define MT6357_BUCK_TOP_DSN_REV0             0x1402
+#define MT6357_BUCK_TOP_DBI                  0x1404
+#define MT6357_BUCK_TOP_DXI                  0x1406
+#define MT6357_BUCK_TOP_PAM0                 0x1408
+#define MT6357_BUCK_TOP_PAM1                 0x140a
+#define MT6357_BUCK_TOP_CLK_CON0             0x140c
+#define MT6357_BUCK_TOP_CLK_CON0_SET         0x140e
+#define MT6357_BUCK_TOP_CLK_CON0_CLR         0x1410
+#define MT6357_BUCK_TOP_CLK_HWEN_CON0        0x1412
+#define MT6357_BUCK_TOP_CLK_HWEN_CON0_SET    0x1414
+#define MT6357_BUCK_TOP_CLK_HWEN_CON0_CLR    0x1416
+#define MT6357_BUCK_TOP_CLK_MISC_CON0        0x1418
+#define MT6357_BUCK_TOP_INT_CON0             0x141a
+#define MT6357_BUCK_TOP_INT_CON0_SET         0x141c
+#define MT6357_BUCK_TOP_INT_CON0_CLR         0x141e
+#define MT6357_BUCK_TOP_INT_MASK_CON0        0x1420
+#define MT6357_BUCK_TOP_INT_MASK_CON0_SET    0x1422
+#define MT6357_BUCK_TOP_INT_MASK_CON0_CLR    0x1424
+#define MT6357_BUCK_TOP_INT_STATUS0          0x1426
+#define MT6357_BUCK_TOP_INT_RAW_STATUS0      0x1428
+#define MT6357_BUCK_TOP_STB_CON              0x142a
+#define MT6357_BUCK_TOP_SLP_CON0             0x142c
+#define MT6357_BUCK_TOP_SLP_CON1             0x142e
+#define MT6357_BUCK_TOP_SLP_CON2             0x1430
+#define MT6357_BUCK_TOP_MINFREQ_CON          0x1432
+#define MT6357_BUCK_TOP_OC_CON0              0x1434
+#define MT6357_BUCK_TOP_K_CON0               0x1436
+#define MT6357_BUCK_TOP_K_CON1               0x1438
+#define MT6357_BUCK_TOP_K_CON2               0x143a
+#define MT6357_BUCK_TOP_WDTDBG0              0x143c
+#define MT6357_BUCK_TOP_WDTDBG1              0x143e
+#define MT6357_BUCK_TOP_WDTDBG2              0x1440
+#define MT6357_BUCK_TOP_ELR_NUM              0x1442
+#define MT6357_BUCK_TOP_ELR0                 0x1444
+#define MT6357_BUCK_TOP_ELR1                 0x1446
+#define MT6357_BUCK_VPROC_DSN_ID             0x1480
+#define MT6357_BUCK_VPROC_DSN_REV0           0x1482
+#define MT6357_BUCK_VPROC_DSN_DBI            0x1484
+#define MT6357_BUCK_VPROC_DSN_DXI            0x1486
+#define MT6357_BUCK_VPROC_CON0               0x1488
+#define MT6357_BUCK_VPROC_CON1               0x148a
+#define MT6357_BUCK_VPROC_CFG0               0x148c
+#define MT6357_BUCK_VPROC_CFG1               0x148e
+#define MT6357_BUCK_VPROC_OP_EN              0x1490
+#define MT6357_BUCK_VPROC_OP_EN_SET          0x1492
+#define MT6357_BUCK_VPROC_OP_EN_CLR          0x1494
+#define MT6357_BUCK_VPROC_OP_CFG             0x1496
+#define MT6357_BUCK_VPROC_OP_CFG_SET         0x1498
+#define MT6357_BUCK_VPROC_OP_CFG_CLR         0x149a
+#define MT6357_BUCK_VPROC_SP_CON             0x149c
+#define MT6357_BUCK_VPROC_SP_CFG             0x149e
+#define MT6357_BUCK_VPROC_OC_CFG             0x14a0
+#define MT6357_BUCK_VPROC_DBG0               0x14a2
+#define MT6357_BUCK_VPROC_DBG1               0x14a4
+#define MT6357_BUCK_VPROC_DBG2               0x14a6
+#define MT6357_BUCK_VPROC_ELR_NUM            0x14a8
+#define MT6357_BUCK_VPROC_ELR0               0x14aa
+#define MT6357_BUCK_VCORE_DSN_ID             0x1500
+#define MT6357_BUCK_VCORE_DSN_REV0           0x1502
+#define MT6357_BUCK_VCORE_DSN_DBI            0x1504
+#define MT6357_BUCK_VCORE_DSN_DXI            0x1506
+#define MT6357_BUCK_VCORE_CON0               0x1508
+#define MT6357_BUCK_VCORE_CON1               0x150a
+#define MT6357_BUCK_VCORE_CFG0               0x150c
+#define MT6357_BUCK_VCORE_CFG1               0x150e
+#define MT6357_BUCK_VCORE_OP_EN              0x1510
+#define MT6357_BUCK_VCORE_OP_EN_SET          0x1512
+#define MT6357_BUCK_VCORE_OP_EN_CLR          0x1514
+#define MT6357_BUCK_VCORE_OP_CFG             0x1516
+#define MT6357_BUCK_VCORE_OP_CFG_SET         0x1518
+#define MT6357_BUCK_VCORE_OP_CFG_CLR         0x151a
+#define MT6357_BUCK_VCORE_SP_CON             0x151c
+#define MT6357_BUCK_VCORE_SP_CFG             0x151e
+#define MT6357_BUCK_VCORE_OC_CFG             0x1520
+#define MT6357_BUCK_VCORE_DBG0               0x1522
+#define MT6357_BUCK_VCORE_DBG1               0x1524
+#define MT6357_BUCK_VCORE_DBG2               0x1526
+#define MT6357_BUCK_VCORE_ELR_NUM            0x1528
+#define MT6357_BUCK_VCORE_ELR0               0x152a
+#define MT6357_BUCK_VMODEM_DSN_ID            0x1580
+#define MT6357_BUCK_VMODEM_DSN_REV0          0x1582
+#define MT6357_BUCK_VMODEM_DSN_DBI           0x1584
+#define MT6357_BUCK_VMODEM_DSN_DXI           0x1586
+#define MT6357_BUCK_VMODEM_CON0              0x1588
+#define MT6357_BUCK_VMODEM_CON1              0x158a
+#define MT6357_BUCK_VMODEM_CFG0              0x158c
+#define MT6357_BUCK_VMODEM_CFG1              0x158e
+#define MT6357_BUCK_VMODEM_OP_EN             0x1590
+#define MT6357_BUCK_VMODEM_OP_EN_SET         0x1592
+#define MT6357_BUCK_VMODEM_OP_EN_CLR         0x1594
+#define MT6357_BUCK_VMODEM_OP_CFG            0x1596
+#define MT6357_BUCK_VMODEM_OP_CFG_SET        0x1598
+#define MT6357_BUCK_VMODEM_OP_CFG_CLR        0x159a
+#define MT6357_BUCK_VMODEM_SP_CON            0x159c
+#define MT6357_BUCK_VMODEM_SP_CFG            0x159e
+#define MT6357_BUCK_VMODEM_OC_CFG            0x15a0
+#define MT6357_BUCK_VMODEM_DBG0              0x15a2
+#define MT6357_BUCK_VMODEM_DBG1              0x15a4
+#define MT6357_BUCK_VMODEM_DBG2              0x15a6
+#define MT6357_BUCK_VMODEM_ELR_NUM           0x15a8
+#define MT6357_BUCK_VMODEM_ELR0              0x15aa
+#define MT6357_BUCK_VS1_DSN_ID               0x1600
+#define MT6357_BUCK_VS1_DSN_REV0             0x1602
+#define MT6357_BUCK_VS1_DSN_DBI              0x1604
+#define MT6357_BUCK_VS1_DSN_DXI              0x1606
+#define MT6357_BUCK_VS1_CON0                 0x1608
+#define MT6357_BUCK_VS1_CON1                 0x160a
+#define MT6357_BUCK_VS1_CFG0                 0x160c
+#define MT6357_BUCK_VS1_CFG1                 0x160e
+#define MT6357_BUCK_VS1_OP_EN                0x1610
+#define MT6357_BUCK_VS1_OP_EN_SET            0x1612
+#define MT6357_BUCK_VS1_OP_EN_CLR            0x1614
+#define MT6357_BUCK_VS1_OP_CFG               0x1616
+#define MT6357_BUCK_VS1_OP_CFG_SET           0x1618
+#define MT6357_BUCK_VS1_OP_CFG_CLR           0x161a
+#define MT6357_BUCK_VS1_SP_CON               0x161c
+#define MT6357_BUCK_VS1_SP_CFG               0x161e
+#define MT6357_BUCK_VS1_OC_CFG               0x1620
+#define MT6357_BUCK_VS1_DBG0                 0x1622
+#define MT6357_BUCK_VS1_DBG1                 0x1624
+#define MT6357_BUCK_VS1_DBG2                 0x1626
+#define MT6357_BUCK_VS1_VOTER                0x1628
+#define MT6357_BUCK_VS1_VOTER_SET            0x162a
+#define MT6357_BUCK_VS1_VOTER_CLR            0x162c
+#define MT6357_BUCK_VS1_VOTER_CFG            0x162e
+#define MT6357_BUCK_VS1_ELR_NUM              0x1630
+#define MT6357_BUCK_VS1_ELR0                 0x1632
+#define MT6357_BUCK_VPA_DSN_ID               0x1680
+#define MT6357_BUCK_VPA_DSN_REV0             0x1682
+#define MT6357_BUCK_VPA_DSN_DBI              0x1684
+#define MT6357_BUCK_VPA_DSN_DXI              0x1686
+#define MT6357_BUCK_VPA_CON0                 0x1688
+#define MT6357_BUCK_VPA_CON1                 0x168a
+#define MT6357_BUCK_VPA_CFG0                 0x168c
+#define MT6357_BUCK_VPA_CFG1                 0x168e
+#define MT6357_BUCK_VPA_OC_CFG               0x1690
+#define MT6357_BUCK_VPA_DBG0                 0x1692
+#define MT6357_BUCK_VPA_DBG1                 0x1694
+#define MT6357_BUCK_VPA_DBG2                 0x1696
+#define MT6357_BUCK_VPA_DLC_CON0             0x1698
+#define MT6357_BUCK_VPA_DLC_CON1             0x169a
+#define MT6357_BUCK_VPA_DLC_CON2             0x169c
+#define MT6357_BUCK_VPA_MSFG_CON0            0x169e
+#define MT6357_BUCK_VPA_MSFG_CON1            0x16a0
+#define MT6357_BUCK_VPA_MSFG_RRATE0          0x16a2
+#define MT6357_BUCK_VPA_MSFG_RRATE1          0x16a4
+#define MT6357_BUCK_VPA_MSFG_RRATE2          0x16a6
+#define MT6357_BUCK_VPA_MSFG_RTHD0           0x16a8
+#define MT6357_BUCK_VPA_MSFG_RTHD1           0x16aa
+#define MT6357_BUCK_VPA_MSFG_RTHD2           0x16ac
+#define MT6357_BUCK_VPA_MSFG_FRATE0          0x16ae
+#define MT6357_BUCK_VPA_MSFG_FRATE1          0x16b0
+#define MT6357_BUCK_VPA_MSFG_FRATE2          0x16b2
+#define MT6357_BUCK_VPA_MSFG_FTHD0           0x16b4
+#define MT6357_BUCK_VPA_MSFG_FTHD1           0x16b6
+#define MT6357_BUCK_VPA_MSFG_FTHD2           0x16b8
+#define MT6357_BUCK_ANA_DSN_ID               0x1700
+#define MT6357_BUCK_ANA_DSN_REV0             0x1702
+#define MT6357_BUCK_ANA_DSN_DBI              0x1704
+#define MT6357_BUCK_ANA_DSN_FPI              0x1706
+#define MT6357_SMPS_ANA_CON0                 0x1708
+#define MT6357_SMPS_ANA_CON1                 0x170a
+#define MT6357_SMPS_ANA_CON2                 0x170c
+#define MT6357_VCORE_VPROC_ANA_CON0          0x170e
+#define MT6357_VCORE_VPROC_ANA_CON1          0x1710
+#define MT6357_VCORE_VPROC_ANA_CON2          0x1712
+#define MT6357_VCORE_VPROC_ANA_CON3          0x1714
+#define MT6357_VCORE_VPROC_ANA_CON4          0x1716
+#define MT6357_VCORE_VPROC_ANA_CON5          0x1718
+#define MT6357_VCORE_VPROC_ANA_CON6          0x171a
+#define MT6357_VCORE_VPROC_ANA_CON7          0x171c
+#define MT6357_VCORE_VPROC_ANA_CON8          0x171e
+#define MT6357_VCORE_VPROC_ANA_CON9          0x1720
+#define MT6357_VCORE_VPROC_ANA_CON10         0x1722
+#define MT6357_VCORE_VPROC_ANA_CON11         0x1724
+#define MT6357_VMODEM_ANA_CON0               0x1726
+#define MT6357_VMODEM_ANA_CON1               0x1728
+#define MT6357_VMODEM_ANA_CON2               0x172a
+#define MT6357_VMODEM_ANA_CON3               0x172c
+#define MT6357_VMODEM_ANA_CON4               0x172e
+#define MT6357_VMODEM_ANA_CON5               0x1730
+#define MT6357_VS1_ANA_CON0                  0x1732
+#define MT6357_VS1_ANA_CON1                  0x1734
+#define MT6357_VS1_ANA_CON2                  0x1736
+#define MT6357_VS1_ANA_CON3                  0x1738
+#define MT6357_VS1_ANA_CON4                  0x173a
+#define MT6357_VS1_ANA_CON5                  0x173c
+#define MT6357_VPA_ANA_CON0                  0x173e
+#define MT6357_VPA_ANA_CON1                  0x1740
+#define MT6357_VPA_ANA_CON2                  0x1742
+#define MT6357_VPA_ANA_CON3                  0x1744
+#define MT6357_VPA_ANA_CON4                  0x1746
+#define MT6357_VPA_ANA_CON5                  0x1748
+#define MT6357_BUCK_ANA_ELR_NUM              0x174a
+#define MT6357_SMPS_ELR_0                    0x174c
+#define MT6357_SMPS_ELR_1                    0x174e
+#define MT6357_SMPS_ELR_2                    0x1750
+#define MT6357_SMPS_ELR_3                    0x1752
+#define MT6357_SMPS_ELR_4                    0x1754
+#define MT6357_SMPS_ELR_5                    0x1756
+#define MT6357_VCORE_VPROC_ELR_0             0x1758
+#define MT6357_VCORE_VPROC_ELR_1             0x175a
+#define MT6357_VCORE_VPROC_ELR_2             0x175c
+#define MT6357_VCORE_VPROC_ELR_3             0x175e
+#define MT6357_VCORE_VPROC_ELR_4             0x1760
+#define MT6357_VMODEM_ELR_0                  0x1762
+#define MT6357_VMODEM_ELR_1                  0x1764
+#define MT6357_VMODEM_ELR_2                  0x1766
+#define MT6357_VS1_ELR_0                     0x1768
+#define MT6357_VS1_ELR_1                     0x176a
+#define MT6357_VPA_ELR_0                     0x176c
+#define MT6357_LDO_TOP_ID                    0x1880
+#define MT6357_LDO_TOP_REV0                  0x1882
+#define MT6357_LDO_TOP_DBI                   0x1884
+#define MT6357_LDO_TOP_DXI                   0x1886
+#define MT6357_LDO_TPM0                      0x1888
+#define MT6357_LDO_TPM1                      0x188a
+#define MT6357_LDO_TOP_CLK_DCM_CON0          0x188c
+#define MT6357_LDO_TOP_CLK_VIO28_CON0        0x188e
+#define MT6357_LDO_TOP_CLK_VIO18_CON0        0x1890
+#define MT6357_LDO_TOP_CLK_VAUD28_CON0       0x1892
+#define MT6357_LDO_TOP_CLK_VDRAM_CON0        0x1894
+#define MT6357_LDO_TOP_CLK_VSRAM_PROC_CON0   0x1896
+#define MT6357_LDO_TOP_CLK_VSRAM_OTHERS_CON0 0x1898
+#define MT6357_LDO_TOP_CLK_VAUX18_CON0       0x189a
+#define MT6357_LDO_TOP_CLK_VUSB33_CON0       0x189c
+#define MT6357_LDO_TOP_CLK_VEMC_CON0         0x189e
+#define MT6357_LDO_TOP_CLK_VXO22_CON0        0x18a0
+#define MT6357_LDO_TOP_CLK_VSIM1_CON0        0x18a2
+#define MT6357_LDO_TOP_CLK_VSIM2_CON0        0x18a4
+#define MT6357_LDO_TOP_CLK_VCAMD_CON0        0x18a6
+#define MT6357_LDO_TOP_CLK_VCAMIO_CON0       0x18a8
+#define MT6357_LDO_TOP_CLK_VEFUSE_CON0       0x18aa
+#define MT6357_LDO_TOP_CLK_VCN33_CON0        0x18ac
+#define MT6357_LDO_TOP_CLK_VCN18_CON0        0x18ae
+#define MT6357_LDO_TOP_CLK_VCN28_CON0        0x18b0
+#define MT6357_LDO_TOP_CLK_VIBR_CON0         0x18b2
+#define MT6357_LDO_TOP_CLK_VFE28_CON0        0x18b4
+#define MT6357_LDO_TOP_CLK_VMCH_CON0         0x18b6
+#define MT6357_LDO_TOP_CLK_VMC_CON0          0x18b8
+#define MT6357_LDO_TOP_CLK_VRF18_CON0        0x18ba
+#define MT6357_LDO_TOP_CLK_VLDO28_CON0       0x18bc
+#define MT6357_LDO_TOP_CLK_VRF12_CON0        0x18be
+#define MT6357_LDO_TOP_CLK_VCAMA_CON0        0x18c0
+#define MT6357_LDO_TOP_CLK_TREF_CON0         0x18c2
+#define MT6357_LDO_TOP_INT_CON0              0x18c4
+#define MT6357_LDO_TOP_INT_CON0_SET          0x18c6
+#define MT6357_LDO_TOP_INT_CON0_CLR          0x18c8
+#define MT6357_LDO_TOP_INT_CON1              0x18ca
+#define MT6357_LDO_TOP_INT_CON1_SET          0x18cc
+#define MT6357_LDO_TOP_INT_CON1_CLR          0x18ce
+#define MT6357_LDO_TOP_INT_MASK_CON0         0x18d0
+#define MT6357_LDO_TOP_INT_MASK_CON0_SET     0x18d2
+#define MT6357_LDO_TOP_INT_MASK_CON0_CLR     0x18d4
+#define MT6357_LDO_TOP_INT_MASK_CON1         0x18d6
+#define MT6357_LDO_TOP_INT_MASK_CON1_SET     0x18d8
+#define MT6357_LDO_TOP_INT_MASK_CON1_CLR     0x18da
+#define MT6357_LDO_TOP_INT_STATUS0           0x18dc
+#define MT6357_LDO_TOP_INT_STATUS1           0x18de
+#define MT6357_LDO_TOP_INT_RAW_STATUS0       0x18e0
+#define MT6357_LDO_TOP_INT_RAW_STATUS1       0x18e2
+#define MT6357_LDO_TEST_CON0                 0x18e4
+#define MT6357_LDO_TOP_WDT_CON0              0x18e6
+#define MT6357_LDO_TOP_RSV_CON0              0x18e8
+#define MT6357_LDO_TOP_RSV_CON1              0x18ea
+#define MT6357_LDO_OCFB0                     0x18ec
+#define MT6357_LDO_LP_PROTECTION             0x18ee
+#define MT6357_LDO_DUMMY_LOAD_GATED          0x18f0
+#define MT6357_LDO_GON0_DSN_ID               0x1900
+#define MT6357_LDO_GON0_DSN_REV0             0x1902
+#define MT6357_LDO_GON0_DSN_DBI              0x1904
+#define MT6357_LDO_GON0_DSN_DXI              0x1906
+#define MT6357_LDO_VXO22_CON0                0x1908
+#define MT6357_LDO_VXO22_OP_EN               0x190a
+#define MT6357_LDO_VXO22_OP_EN_SET           0x190c
+#define MT6357_LDO_VXO22_OP_EN_CLR           0x190e
+#define MT6357_LDO_VXO22_OP_CFG              0x1910
+#define MT6357_LDO_VXO22_OP_CFG_SET          0x1912
+#define MT6357_LDO_VXO22_OP_CFG_CLR          0x1914
+#define MT6357_LDO_VXO22_CON1                0x1916
+#define MT6357_LDO_VXO22_CON2                0x1918
+#define MT6357_LDO_VXO22_CON3                0x191a
+#define MT6357_LDO_VAUX18_CON0               0x191c
+#define MT6357_LDO_VAUX18_OP_EN              0x191e
+#define MT6357_LDO_VAUX18_OP_EN_SET          0x1920
+#define MT6357_LDO_VAUX18_OP_EN_CLR          0x1922
+#define MT6357_LDO_VAUX18_OP_CFG             0x1924
+#define MT6357_LDO_VAUX18_OP_CFG_SET         0x1926
+#define MT6357_LDO_VAUX18_OP_CFG_CLR         0x1928
+#define MT6357_LDO_VAUX18_CON1               0x192a
+#define MT6357_LDO_VAUX18_CON2               0x192c
+#define MT6357_LDO_VAUX18_CON3               0x192e
+#define MT6357_LDO_VAUD28_CON0               0x1930
+#define MT6357_LDO_VAUD28_OP_EN              0x1932
+#define MT6357_LDO_VAUD28_OP_EN_SET          0x1934
+#define MT6357_LDO_VAUD28_OP_EN_CLR          0x1936
+#define MT6357_LDO_VAUD28_OP_CFG             0x1938
+#define MT6357_LDO_VAUD28_OP_CFG_SET         0x193a
+#define MT6357_LDO_VAUD28_OP_CFG_CLR         0x193c
+#define MT6357_LDO_VAUD28_CON1               0x193e
+#define MT6357_LDO_VAUD28_CON2               0x1940
+#define MT6357_LDO_VAUD28_CON3               0x1942
+#define MT6357_LDO_VIO28_CON0                0x1944
+#define MT6357_LDO_VIO28_OP_EN               0x1946
+#define MT6357_LDO_VIO28_OP_EN_SET           0x1948
+#define MT6357_LDO_VIO28_OP_EN_CLR           0x194a
+#define MT6357_LDO_VIO28_OP_CFG              0x194c
+#define MT6357_LDO_VIO28_OP_CFG_SET          0x194e
+#define MT6357_LDO_VIO28_OP_CFG_CLR          0x1950
+#define MT6357_LDO_VIO28_CON1                0x1952
+#define MT6357_LDO_VIO28_CON2                0x1954
+#define MT6357_LDO_VIO28_CON3                0x1956
+#define MT6357_LDO_VIO18_CON0                0x1958
+#define MT6357_LDO_VIO18_OP_EN               0x195a
+#define MT6357_LDO_VIO18_OP_EN_SET           0x195c
+#define MT6357_LDO_VIO18_OP_EN_CLR           0x195e
+#define MT6357_LDO_VIO18_OP_CFG              0x1960
+#define MT6357_LDO_VIO18_OP_CFG_SET          0x1962
+#define MT6357_LDO_VIO18_OP_CFG_CLR          0x1964
+#define MT6357_LDO_VIO18_CON1                0x1966
+#define MT6357_LDO_VIO18_CON2                0x1968
+#define MT6357_LDO_VIO18_CON3                0x196a
+#define MT6357_LDO_VDRAM_CON0                0x196c
+#define MT6357_LDO_VDRAM_OP_EN               0x196e
+#define MT6357_LDO_VDRAM_OP_EN_SET           0x1970
+#define MT6357_LDO_VDRAM_OP_EN_CLR           0x1972
+#define MT6357_LDO_VDRAM_OP_CFG              0x1974
+#define MT6357_LDO_VDRAM_OP_CFG_SET          0x1976
+#define MT6357_LDO_VDRAM_OP_CFG_CLR          0x1978
+#define MT6357_LDO_VDRAM_CON1                0x197a
+#define MT6357_LDO_VDRAM_CON2                0x197c
+#define MT6357_LDO_VDRAM_CON3                0x197e
+#define MT6357_LDO_GON1_DSN_ID               0x1980
+#define MT6357_LDO_GON1_DSN_REV0             0x1982
+#define MT6357_LDO_GON1_DSN_DBI              0x1984
+#define MT6357_LDO_GON1_DSN_DXI              0x1986
+#define MT6357_LDO_VEMC_CON0                 0x1988
+#define MT6357_LDO_VEMC_OP_EN                0x198a
+#define MT6357_LDO_VEMC_OP_EN_SET            0x198c
+#define MT6357_LDO_VEMC_OP_EN_CLR            0x198e
+#define MT6357_LDO_VEMC_OP_CFG               0x1990
+#define MT6357_LDO_VEMC_OP_CFG_SET           0x1992
+#define MT6357_LDO_VEMC_OP_CFG_CLR           0x1994
+#define MT6357_LDO_VEMC_CON1                 0x1996
+#define MT6357_LDO_VEMC_CON2                 0x1998
+#define MT6357_LDO_VEMC_CON3                 0x199a
+#define MT6357_LDO_VUSB33_CON0_0             0x199c
+#define MT6357_LDO_VUSB33_OP_EN              0x199e
+#define MT6357_LDO_VUSB33_OP_EN_SET          0x19a0
+#define MT6357_LDO_VUSB33_OP_EN_CLR          0x19a2
+#define MT6357_LDO_VUSB33_OP_CFG             0x19a4
+#define MT6357_LDO_VUSB33_OP_CFG_SET         0x19a6
+#define MT6357_LDO_VUSB33_OP_CFG_CLR         0x19a8
+#define MT6357_LDO_VUSB33_CON0_1             0x19aa
+#define MT6357_LDO_VUSB33_CON1               0x19ac
+#define MT6357_LDO_VUSB33_CON2               0x19ae
+#define MT6357_LDO_VUSB33_CON3               0x19b0
+#define MT6357_LDO_VSRAM_PROC_CON0           0x19b2
+#define MT6357_LDO_VSRAM_PROC_CON2           0x19b4
+#define MT6357_LDO_VSRAM_PROC_CFG0           0x19b6
+#define MT6357_LDO_VSRAM_PROC_CFG1           0x19b8
+#define MT6357_LDO_VSRAM_PROC_OP_EN          0x19ba
+#define MT6357_LDO_VSRAM_PROC_OP_EN_SET      0x19bc
+#define MT6357_LDO_VSRAM_PROC_OP_EN_CLR      0x19be
+#define MT6357_LDO_VSRAM_PROC_OP_CFG         0x19c0
+#define MT6357_LDO_VSRAM_PROC_OP_CFG_SET     0x19c2
+#define MT6357_LDO_VSRAM_PROC_OP_CFG_CLR     0x19c4
+#define MT6357_LDO_VSRAM_PROC_CON3           0x19c6
+#define MT6357_LDO_VSRAM_PROC_CON4           0x19c8
+#define MT6357_LDO_VSRAM_PROC_CON5           0x19ca
+#define MT6357_LDO_VSRAM_PROC_DBG0           0x19cc
+#define MT6357_LDO_VSRAM_PROC_DBG1           0x19ce
+#define MT6357_LDO_VSRAM_OTHERS_CON0         0x19d0
+#define MT6357_LDO_VSRAM_OTHERS_CON2         0x19d2
+#define MT6357_LDO_VSRAM_OTHERS_CFG0         0x19d4
+#define MT6357_LDO_VSRAM_OTHERS_CFG1         0x19d6
+#define MT6357_LDO_VSRAM_OTHERS_OP_EN        0x19d8
+#define MT6357_LDO_VSRAM_OTHERS_OP_EN_SET    0x19da
+#define MT6357_LDO_VSRAM_OTHERS_OP_EN_CLR    0x19dc
+#define MT6357_LDO_VSRAM_OTHERS_OP_CFG       0x19de
+#define MT6357_LDO_VSRAM_OTHERS_OP_CFG_SET   0x19e0
+#define MT6357_LDO_VSRAM_OTHERS_OP_CFG_CLR   0x19e2
+#define MT6357_LDO_VSRAM_OTHERS_CON3         0x19e4
+#define MT6357_LDO_VSRAM_OTHERS_CON4         0x19e6
+#define MT6357_LDO_VSRAM_OTHERS_CON5         0x19e8
+#define MT6357_LDO_VSRAM_OTHERS_DBG0         0x19ea
+#define MT6357_LDO_VSRAM_OTHERS_DBG1         0x19ec
+#define MT6357_LDO_VSRAM_PROC_SP             0x19ee
+#define MT6357_LDO_VSRAM_OTHERS_SP           0x19f0
+#define MT6357_LDO_VSRAM_PROC_R2R_PDN_DIS    0x19f2
+#define MT6357_LDO_VSRAM_OTHERS_R2R_PDN_DIS  0x19f4
+#define MT6357_LDO_VSRAM_WDT_DBG0            0x19f6
+#define MT6357_LDO_GON1_ELR_NUM              0x19f8
+#define MT6357_LDO_VSRAM_CON0                0x19fa
+#define MT6357_LDO_VSRAM_CON1                0x19fc
+#define MT6357_LDO_VSRAM_CON2                0x19fe
+#define MT6357_LDO_GOFF0_DSN_ID              0x1a00
+#define MT6357_LDO_GOFF0_DSN_REV0            0x1a02
+#define MT6357_LDO_GOFF0_DSN_DBI             0x1a04
+#define MT6357_LDO_GOFF0_DSN_DXI             0x1a06
+#define MT6357_LDO_VFE28_CON0                0x1a08
+#define MT6357_LDO_VFE28_OP_EN               0x1a0a
+#define MT6357_LDO_VFE28_OP_EN_SET           0x1a0c
+#define MT6357_LDO_VFE28_OP_EN_CLR           0x1a0e
+#define MT6357_LDO_VFE28_OP_CFG              0x1a10
+#define MT6357_LDO_VFE28_OP_CFG_SET          0x1a12
+#define MT6357_LDO_VFE28_OP_CFG_CLR          0x1a14
+#define MT6357_LDO_VFE28_CON1                0x1a16
+#define MT6357_LDO_VFE28_CON2                0x1a18
+#define MT6357_LDO_VFE28_CON3                0x1a1a
+#define MT6357_LDO_VRF18_CON0                0x1a1c
+#define MT6357_LDO_VRF18_OP_EN               0x1a1e
+#define MT6357_LDO_VRF18_OP_EN_SET           0x1a20
+#define MT6357_LDO_VRF18_OP_EN_CLR           0x1a22
+#define MT6357_LDO_VRF18_OP_CFG              0x1a24
+#define MT6357_LDO_VRF18_OP_CFG_SET          0x1a26
+#define MT6357_LDO_VRF18_OP_CFG_CLR          0x1a28
+#define MT6357_LDO_VRF18_CON1                0x1a2a
+#define MT6357_LDO_VRF18_CON2                0x1a2c
+#define MT6357_LDO_VRF18_CON3                0x1a2e
+#define MT6357_LDO_VRF12_CON0                0x1a30
+#define MT6357_LDO_VRF12_OP_EN               0x1a32
+#define MT6357_LDO_VRF12_OP_EN_SET           0x1a34
+#define MT6357_LDO_VRF12_OP_EN_CLR           0x1a36
+#define MT6357_LDO_VRF12_OP_CFG              0x1a38
+#define MT6357_LDO_VRF12_OP_CFG_SET          0x1a3a
+#define MT6357_LDO_VRF12_OP_CFG_CLR          0x1a3c
+#define MT6357_LDO_VRF12_CON1                0x1a3e
+#define MT6357_LDO_VRF12_CON2                0x1a40
+#define MT6357_LDO_VRF12_CON3                0x1a42
+#define MT6357_LDO_VEFUSE_CON0               0x1a44
+#define MT6357_LDO_VEFUSE_OP_EN              0x1a46
+#define MT6357_LDO_VEFUSE_OP_EN_SET          0x1a48
+#define MT6357_LDO_VEFUSE_OP_EN_CLR          0x1a4a
+#define MT6357_LDO_VEFUSE_OP_CFG             0x1a4c
+#define MT6357_LDO_VEFUSE_OP_CFG_SET         0x1a4e
+#define MT6357_LDO_VEFUSE_OP_CFG_CLR         0x1a50
+#define MT6357_LDO_VEFUSE_CON1               0x1a52
+#define MT6357_LDO_VEFUSE_CON2               0x1a54
+#define MT6357_LDO_VEFUSE_CON3               0x1a56
+#define MT6357_LDO_VCN18_CON0                0x1a58
+#define MT6357_LDO_VCN18_OP_EN               0x1a5a
+#define MT6357_LDO_VCN18_OP_EN_SET           0x1a5c
+#define MT6357_LDO_VCN18_OP_EN_CLR           0x1a5e
+#define MT6357_LDO_VCN18_OP_CFG              0x1a60
+#define MT6357_LDO_VCN18_OP_CFG_SET          0x1a62
+#define MT6357_LDO_VCN18_OP_CFG_CLR          0x1a64
+#define MT6357_LDO_VCN18_CON1                0x1a66
+#define MT6357_LDO_VCN18_CON2                0x1a68
+#define MT6357_LDO_VCN18_CON3                0x1a6a
+#define MT6357_LDO_VCAMA_CON0                0x1a6c
+#define MT6357_LDO_VCAMA_OP_EN               0x1a6e
+#define MT6357_LDO_VCAMA_OP_EN_SET           0x1a70
+#define MT6357_LDO_VCAMA_OP_EN_CLR           0x1a72
+#define MT6357_LDO_VCAMA_OP_CFG              0x1a74
+#define MT6357_LDO_VCAMA_OP_CFG_SET          0x1a76
+#define MT6357_LDO_VCAMA_OP_CFG_CLR          0x1a78
+#define MT6357_LDO_VCAMA_CON1                0x1a7a
+#define MT6357_LDO_VCAMA_CON2                0x1a7c
+#define MT6357_LDO_VCAMA_CON3                0x1a7e
+#define MT6357_LDO_GOFF1_DSN_ID              0x1a80
+#define MT6357_LDO_GOFF1_DSN_REV0            0x1a82
+#define MT6357_LDO_GOFF1_DSN_DBI             0x1a84
+#define MT6357_LDO_GOFF1_DSN_DXI             0x1a86
+#define MT6357_LDO_VCAMD_CON0                0x1a88
+#define MT6357_LDO_VCAMD_OP_EN               0x1a8a
+#define MT6357_LDO_VCAMD_OP_EN_SET           0x1a8c
+#define MT6357_LDO_VCAMD_OP_EN_CLR           0x1a8e
+#define MT6357_LDO_VCAMD_OP_CFG              0x1a90
+#define MT6357_LDO_VCAMD_OP_CFG_SET          0x1a92
+#define MT6357_LDO_VCAMD_OP_CFG_CLR          0x1a94
+#define MT6357_LDO_VCAMD_CON1                0x1a96
+#define MT6357_LDO_VCAMD_CON2                0x1a98
+#define MT6357_LDO_VCAMD_CON3                0x1a9a
+#define MT6357_LDO_VCAMIO_CON0               0x1a9c
+#define MT6357_LDO_VCAMIO_OP_EN              0x1a9e
+#define MT6357_LDO_VCAMIO_OP_EN_SET          0x1aa0
+#define MT6357_LDO_VCAMIO_OP_EN_CLR          0x1aa2
+#define MT6357_LDO_VCAMIO_OP_CFG             0x1aa4
+#define MT6357_LDO_VCAMIO_OP_CFG_SET         0x1aa6
+#define MT6357_LDO_VCAMIO_OP_CFG_CLR         0x1aa8
+#define MT6357_LDO_VCAMIO_CON1               0x1aaa
+#define MT6357_LDO_VCAMIO_CON2               0x1aac
+#define MT6357_LDO_VCAMIO_CON3               0x1aae
+#define MT6357_LDO_VMC_CON0                  0x1ab0
+#define MT6357_LDO_VMC_OP_EN                 0x1ab2
+#define MT6357_LDO_VMC_OP_EN_SET             0x1ab4
+#define MT6357_LDO_VMC_OP_EN_CLR             0x1ab6
+#define MT6357_LDO_VMC_OP_CFG                0x1ab8
+#define MT6357_LDO_VMC_OP_CFG_SET            0x1aba
+#define MT6357_LDO_VMC_OP_CFG_CLR            0x1abc
+#define MT6357_LDO_VMC_CON1                  0x1abe
+#define MT6357_LDO_VMC_CON2                  0x1ac0
+#define MT6357_LDO_VMC_CON3                  0x1ac2
+#define MT6357_LDO_VMCH_CON0                 0x1ac4
+#define MT6357_LDO_VMCH_OP_EN                0x1ac6
+#define MT6357_LDO_VMCH_OP_EN_SET            0x1ac8
+#define MT6357_LDO_VMCH_OP_EN_CLR            0x1aca
+#define MT6357_LDO_VMCH_OP_CFG               0x1acc
+#define MT6357_LDO_VMCH_OP_CFG_SET           0x1ace
+#define MT6357_LDO_VMCH_OP_CFG_CLR           0x1ad0
+#define MT6357_LDO_VMCH_CON1                 0x1ad2
+#define MT6357_LDO_VMCH_CON2                 0x1ad4
+#define MT6357_LDO_VMCH_CON3                 0x1ad6
+#define MT6357_LDO_VSIM1_CON0                0x1ad8
+#define MT6357_LDO_VSIM1_OP_EN               0x1ada
+#define MT6357_LDO_VSIM1_OP_EN_SET           0x1adc
+#define MT6357_LDO_VSIM1_OP_EN_CLR           0x1ade
+#define MT6357_LDO_VSIM1_OP_CFG              0x1ae0
+#define MT6357_LDO_VSIM1_OP_CFG_SET          0x1ae2
+#define MT6357_LDO_VSIM1_OP_CFG_CLR          0x1ae4
+#define MT6357_LDO_VSIM1_CON1                0x1ae6
+#define MT6357_LDO_VSIM1_CON2                0x1ae8
+#define MT6357_LDO_VSIM1_CON3                0x1aea
+#define MT6357_LDO_VSIM2_CON0                0x1aec
+#define MT6357_LDO_VSIM2_OP_EN               0x1aee
+#define MT6357_LDO_VSIM2_OP_EN_SET           0x1af0
+#define MT6357_LDO_VSIM2_OP_EN_CLR           0x1af2
+#define MT6357_LDO_VSIM2_OP_CFG              0x1af4
+#define MT6357_LDO_VSIM2_OP_CFG_SET          0x1af6
+#define MT6357_LDO_VSIM2_OP_CFG_CLR          0x1af8
+#define MT6357_LDO_VSIM2_CON1                0x1afa
+#define MT6357_LDO_VSIM2_CON2                0x1afc
+#define MT6357_LDO_VSIM2_CON3                0x1afe
+#define MT6357_LDO_GOFF2_DSN_ID              0x1b00
+#define MT6357_LDO_GOFF2_DSN_REV0            0x1b02
+#define MT6357_LDO_GOFF2_DSN_DBI             0x1b04
+#define MT6357_LDO_GOFF2_DSN_DXI             0x1b06
+#define MT6357_LDO_VIBR_CON0                 0x1b08
+#define MT6357_LDO_VIBR_OP_EN                0x1b0a
+#define MT6357_LDO_VIBR_OP_EN_SET            0x1b0c
+#define MT6357_LDO_VIBR_OP_EN_CLR            0x1b0e
+#define MT6357_LDO_VIBR_OP_CFG               0x1b10
+#define MT6357_LDO_VIBR_OP_CFG_SET           0x1b12
+#define MT6357_LDO_VIBR_OP_CFG_CLR           0x1b14
+#define MT6357_LDO_VIBR_CON1                 0x1b16
+#define MT6357_LDO_VIBR_CON2                 0x1b18
+#define MT6357_LDO_VIBR_CON3                 0x1b1a
+#define MT6357_LDO_VCN33_CON0_0              0x1b1c
+#define MT6357_LDO_VCN33_OP_EN               0x1b1e
+#define MT6357_LDO_VCN33_OP_EN_SET           0x1b20
+#define MT6357_LDO_VCN33_OP_EN_CLR           0x1b22
+#define MT6357_LDO_VCN33_OP_CFG              0x1b24
+#define MT6357_LDO_VCN33_OP_CFG_SET          0x1b26
+#define MT6357_LDO_VCN33_OP_CFG_CLR          0x1b28
+#define MT6357_LDO_VCN33_CON0_1              0x1b2a
+#define MT6357_LDO_VCN33_CON1                0x1b2c
+#define MT6357_LDO_VCN33_CON2                0x1b2e
+#define MT6357_LDO_VCN33_CON3                0x1b30
+#define MT6357_LDO_VLDO28_CON0_0             0x1b32
+#define MT6357_LDO_VLDO28_OP_EN              0x1b34
+#define MT6357_LDO_VLDO28_OP_EN_SET          0x1b36
+#define MT6357_LDO_VLDO28_OP_EN_CLR          0x1b38
+#define MT6357_LDO_VLDO28_OP_CFG             0x1b3a
+#define MT6357_LDO_VLDO28_OP_CFG_SET         0x1b3c
+#define MT6357_LDO_VLDO28_OP_CFG_CLR         0x1b3e
+#define MT6357_LDO_VLDO28_CON0_1             0x1b40
+#define MT6357_LDO_VLDO28_CON1               0x1b42
+#define MT6357_LDO_VLDO28_CON2               0x1b44
+#define MT6357_LDO_VLDO28_CON3               0x1b46
+#define MT6357_LDO_GOFF2_RSV_CON0            0x1b48
+#define MT6357_LDO_GOFF2_RSV_CON1            0x1b4a
+#define MT6357_LDO_GOFF3_DSN_ID              0x1b80
+#define MT6357_LDO_GOFF3_DSN_REV0            0x1b82
+#define MT6357_LDO_GOFF3_DSN_DBI             0x1b84
+#define MT6357_LDO_GOFF3_DSN_DXI             0x1b86
+#define MT6357_LDO_VCN28_CON0                0x1b88
+#define MT6357_LDO_VCN28_OP_EN               0x1b8a
+#define MT6357_LDO_VCN28_OP_EN_SET           0x1b8c
+#define MT6357_LDO_VCN28_OP_EN_CLR           0x1b8e
+#define MT6357_LDO_VCN28_OP_CFG              0x1b90
+#define MT6357_LDO_VCN28_OP_CFG_SET          0x1b92
+#define MT6357_LDO_VCN28_OP_CFG_CLR          0x1b94
+#define MT6357_LDO_VCN28_CON1                0x1b96
+#define MT6357_LDO_VCN28_CON2                0x1b98
+#define MT6357_LDO_VCN28_CON3                0x1b9a
+#define MT6357_VRTC_CON0                     0x1b9c
+#define MT6357_LDO_TREF_CON0                 0x1b9e
+#define MT6357_LDO_TREF_OP_EN                0x1ba0
+#define MT6357_LDO_TREF_OP_EN_SET            0x1ba2
+#define MT6357_LDO_TREF_OP_EN_CLR            0x1ba4
+#define MT6357_LDO_TREF_OP_CFG               0x1ba6
+#define MT6357_LDO_TREF_OP_CFG_SET           0x1ba8
+#define MT6357_LDO_TREF_OP_CFG_CLR           0x1baa
+#define MT6357_LDO_TREF_CON1                 0x1bac
+#define MT6357_LDO_GOFF3_RSV_CON0            0x1bae
+#define MT6357_LDO_GOFF3_RSV_CON1            0x1bb0
+#define MT6357_LDO_ANA0_DSN_ID               0x1c00
+#define MT6357_LDO_ANA0_DSN_REV0             0x1c02
+#define MT6357_LDO_ANA0_DSN_DBI              0x1c04
+#define MT6357_LDO_ANA0_DSN_DXI              0x1c06
+#define MT6357_VFE28_ANA_CON0                0x1c08
+#define MT6357_VFE28_ANA_CON1                0x1c0a
+#define MT6357_VCN28_ANA_CON0                0x1c0c
+#define MT6357_VCN28_ANA_CON1                0x1c0e
+#define MT6357_VAUD28_ANA_CON0               0x1c10
+#define MT6357_VAUD28_ANA_CON1               0x1c12
+#define MT6357_VAUX18_ANA_CON0               0x1c14
+#define MT6357_VAUX18_ANA_CON1               0x1c16
+#define MT6357_VXO22_ANA_CON0                0x1c18
+#define MT6357_VXO22_ANA_CON1                0x1c1a
+#define MT6357_VCN33_ANA_CON0                0x1c1c
+#define MT6357_VCN33_ANA_CON1                0x1c1e
+#define MT6357_VEMC_ANA_CON0                 0x1c20
+#define MT6357_VEMC_ANA_CON1                 0x1c22
+#define MT6357_VLDO28_ANA_CON0               0x1c24
+#define MT6357_VLDO28_ANA_CON1               0x1c26
+#define MT6357_VIO28_ANA_CON0                0x1c28
+#define MT6357_VIO28_ANA_CON1                0x1c2a
+#define MT6357_VIBR_ANA_CON0                 0x1c2c
+#define MT6357_VIBR_ANA_CON1                 0x1c2e
+#define MT6357_VSIM1_ANA_CON0                0x1c30
+#define MT6357_VSIM1_ANA_CON1                0x1c32
+#define MT6357_VSIM2_ANA_CON0                0x1c34
+#define MT6357_VSIM2_ANA_CON1                0x1c36
+#define MT6357_VMCH_ANA_CON0                 0x1c38
+#define MT6357_VMCH_ANA_CON1                 0x1c3a
+#define MT6357_VMC_ANA_CON0                  0x1c3c
+#define MT6357_VMC_ANA_CON1                  0x1c3e
+#define MT6357_VCAMIO_ANA_CON0               0x1c40
+#define MT6357_VCAMIO_ANA_CON1               0x1c42
+#define MT6357_VCN18_ANA_CON0                0x1c44
+#define MT6357_VCN18_ANA_CON1                0x1c46
+#define MT6357_VRF18_ANA_CON0                0x1c48
+#define MT6357_VRF18_ANA_CON1                0x1c4a
+#define MT6357_VIO18_ANA_CON0                0x1c4c
+#define MT6357_VIO18_ANA_CON1                0x1c4e
+#define MT6357_VDRAM_ANA_CON1                0x1c50
+#define MT6357_VRF12_ANA_CON0                0x1c52
+#define MT6357_VRF12_ANA_CON1                0x1c54
+#define MT6357_VSRAM_PROC_ANA_CON0           0x1c56
+#define MT6357_VSRAM_OTHERS_ANA_CON0         0x1c58
+#define MT6357_LDO_ANA0_ELR_NUM              0x1c5a
+#define MT6357_VFE28_ELR_0                   0x1c5c
+#define MT6357_VCN28_ELR_0                   0x1c5e
+#define MT6357_VAUD28_ELR_0                  0x1c60
+#define MT6357_VAUX18_ELR_0                  0x1c62
+#define MT6357_VXO22_ELR_0                   0x1c64
+#define MT6357_VCN33_ELR_0                   0x1c66
+#define MT6357_VEMC_ELR_0                    0x1c68
+#define MT6357_VLDO28_ELR_0                  0x1c6a
+#define MT6357_VIO28_ELR_0                   0x1c6c
+#define MT6357_VIBR_ELR_0                    0x1c6e
+#define MT6357_VSIM1_ELR_0                   0x1c70
+#define MT6357_VSIM2_ELR_0                   0x1c72
+#define MT6357_VMCH_ELR_0                    0x1c74
+#define MT6357_VMC_ELR_0                     0x1c76
+#define MT6357_VCAMIO_ELR_0                  0x1c78
+#define MT6357_VCN18_ELR_0                   0x1c7a
+#define MT6357_VRF18_ELR_0                   0x1c7c
+#define MT6357_LDO_ANA1_DSN_ID               0x1c80
+#define MT6357_LDO_ANA1_DSN_REV0             0x1c82
+#define MT6357_LDO_ANA1_DSN_DBI              0x1c84
+#define MT6357_LDO_ANA1_DSN_DXI              0x1c86
+#define MT6357_VUSB33_ANA_CON0               0x1c88
+#define MT6357_VUSB33_ANA_CON1               0x1c8a
+#define MT6357_VCAMA_ANA_CON0                0x1c8c
+#define MT6357_VCAMA_ANA_CON1                0x1c8e
+#define MT6357_VEFUSE_ANA_CON0               0x1c90
+#define MT6357_VEFUSE_ANA_CON1               0x1c92
+#define MT6357_VCAMD_ANA_CON0                0x1c94
+#define MT6357_VCAMD_ANA_CON1                0x1c96
+#define MT6357_LDO_ANA1_ELR_NUM              0x1c98
+#define MT6357_VUSB33_ELR_0                  0x1c9a
+#define MT6357_VCAMA_ELR_0                   0x1c9c
+#define MT6357_VEFUSE_ELR_0                  0x1c9e
+#define MT6357_VCAMD_ELR_0                   0x1ca0
+#define MT6357_VIO18_ELR_0                   0x1ca2
+#define MT6357_VDRAM_ELR_0                   0x1ca4
+#define MT6357_VRF12_ELR_0                   0x1ca6
+#define MT6357_VRTC_ELR_0                    0x1ca8
+#define MT6357_VDRAM_ELR_1                   0x1caa
+#define MT6357_VDRAM_ELR_2                   0x1cac
+#define MT6357_XPP_TOP_ID                    0x1e00
+#define MT6357_XPP_TOP_REV0                  0x1e02
+#define MT6357_XPP_TOP_DBI                   0x1e04
+#define MT6357_XPP_TOP_DXI                   0x1e06
+#define MT6357_XPP_TPM0                      0x1e08
+#define MT6357_XPP_TPM1                      0x1e0a
+#define MT6357_XPP_TOP_TEST_OUT              0x1e0c
+#define MT6357_XPP_TOP_TEST_CON0             0x1e0e
+#define MT6357_XPP_TOP_CKPDN_CON0            0x1e10
+#define MT6357_XPP_TOP_CKPDN_CON0_SET        0x1e12
+#define MT6357_XPP_TOP_CKPDN_CON0_CLR        0x1e14
+#define MT6357_XPP_TOP_CKSEL_CON0            0x1e16
+#define MT6357_XPP_TOP_CKSEL_CON0_SET        0x1e18
+#define MT6357_XPP_TOP_CKSEL_CON0_CLR        0x1e1a
+#define MT6357_XPP_TOP_RST_CON0              0x1e1c
+#define MT6357_XPP_TOP_RST_CON0_SET          0x1e1e
+#define MT6357_XPP_TOP_RST_CON0_CLR          0x1e20
+#define MT6357_XPP_TOP_RST_BANK_CON0         0x1e22
+#define MT6357_XPP_TOP_RST_BANK_CON0_SET     0x1e24
+#define MT6357_XPP_TOP_RST_BANK_CON0_CLR     0x1e26
+#define MT6357_DRIVER_BL_DSN_ID              0x1e80
+#define MT6357_DRIVER_BL_DSN_REV0            0x1e82
+#define MT6357_DRIVER_BL_DSN_DBI             0x1e84
+#define MT6357_DRIVER_BL_DSN_DXI             0x1e86
+#define MT6357_ISINK1_CON0                   0x1e88
+#define MT6357_ISINK1_CON1                   0x1e8a
+#define MT6357_ISINK1_CON2                   0x1e8c
+#define MT6357_ISINK1_CON3                   0x1e8e
+#define MT6357_ISINK_ANA1                    0x1e90
+#define MT6357_ISINK_PHASE_DLY               0x1e92
+#define MT6357_ISINK_SFSTR                   0x1e94
+#define MT6357_ISINK_EN_CTRL                 0x1e96
+#define MT6357_ISINK_MODE_CTRL               0x1e98
+#define MT6357_DRIVER_ANA_CON0               0x1e9a
+#define MT6357_ISINK_ANA_CON0                0x1e9c
+#define MT6357_ISINK_ANA_CON1                0x1e9e
+#define MT6357_DRIVER_BL_ELR_NUM             0x1ea0
+#define MT6357_DRIVER_BL_ELR_0               0x1ea2
+#define MT6357_DRIVER_CI_DSN_ID              0x1f00
+#define MT6357_DRIVER_CI_DSN_REV0            0x1f02
+#define MT6357_DRIVER_CI_DSN_DBI             0x1f04
+#define MT6357_DRIVER_CI_DSN_DXI             0x1f06
+#define MT6357_CHRIND_CON0                   0x1f08
+#define MT6357_CHRIND_CON1                   0x1f0a
+#define MT6357_CHRIND_CON2                   0x1f0c
+#define MT6357_CHRIND_CON3                   0x1f0e
+#define MT6357_CHRIND_CON4                   0x1f10
+#define MT6357_CHRIND_EN_CTRL                0x1f12
+#define MT6357_CHRIND_ANA_CON0               0x1f14
+#define MT6357_DRIVER_DL_DSN_ID              0x1f80
+#define MT6357_DRIVER_DL_DSN_REV0            0x1f82
+#define MT6357_DRIVER_DL_DSN_DBI             0x1f84
+#define MT6357_DRIVER_DL_DSN_DXI             0x1f86
+#define MT6357_ISINK2_CON0                   0x1f88
+#define MT6357_ISINK3_CON0                   0x1f8a
+#define MT6357_ISINK_EN_CTRL_SMPL            0x1f8c
+#define MT6357_AUD_TOP_ID                    0x2080
+#define MT6357_AUD_TOP_REV0                  0x2082
+#define MT6357_AUD_TOP_DBI                   0x2084
+#define MT6357_AUD_TOP_DXI                   0x2086
+#define MT6357_AUD_TOP_CKPDN_TPM0            0x2088
+#define MT6357_AUD_TOP_CKPDN_TPM1            0x208a
+#define MT6357_AUD_TOP_CKPDN_CON0            0x208c
+#define MT6357_AUD_TOP_CKPDN_CON0_SET        0x208e
+#define MT6357_AUD_TOP_CKPDN_CON0_CLR        0x2090
+#define MT6357_AUD_TOP_CKSEL_CON0            0x2092
+#define MT6357_AUD_TOP_CKSEL_CON0_SET        0x2094
+#define MT6357_AUD_TOP_CKSEL_CON0_CLR        0x2096
+#define MT6357_AUD_TOP_CKTST_CON0            0x2098
+#define MT6357_AUD_TOP_RST_CON0              0x209a
+#define MT6357_AUD_TOP_RST_CON0_SET          0x209c
+#define MT6357_AUD_TOP_RST_CON0_CLR          0x209e
+#define MT6357_AUD_TOP_RST_BANK_CON0         0x20a0
+#define MT6357_AUD_TOP_INT_CON0              0x20a2
+#define MT6357_AUD_TOP_INT_CON0_SET          0x20a4
+#define MT6357_AUD_TOP_INT_CON0_CLR          0x20a6
+#define MT6357_AUD_TOP_INT_MASK_CON0         0x20a8
+#define MT6357_AUD_TOP_INT_MASK_CON0_SET     0x20aa
+#define MT6357_AUD_TOP_INT_MASK_CON0_CLR     0x20ac
+#define MT6357_AUD_TOP_INT_STATUS0           0x20ae
+#define MT6357_AUD_TOP_INT_RAW_STATUS0       0x20b0
+#define MT6357_AUD_TOP_INT_MISC_CON0         0x20b2
+#define MT6357_AUDNCP_CLKDIV_CON0            0x20b4
+#define MT6357_AUDNCP_CLKDIV_CON1            0x20b6
+#define MT6357_AUDNCP_CLKDIV_CON2            0x20b8
+#define MT6357_AUDNCP_CLKDIV_CON3            0x20ba
+#define MT6357_AUDNCP_CLKDIV_CON4            0x20bc
+#define MT6357_AUD_TOP_MON_CON0              0x20be
+#define MT6357_AUDIO_DIG_DSN_ID              0x2100
+#define MT6357_AUDIO_DIG_DSN_REV0            0x2102
+#define MT6357_AUDIO_DIG_DSN_DBI             0x2104
+#define MT6357_AUDIO_DIG_DSN_DXI             0x2106
+#define MT6357_AFE_UL_DL_CON0                0x2108
+#define MT6357_AFE_DL_SRC2_CON0_L            0x210a
+#define MT6357_AFE_UL_SRC_CON0_H             0x210c
+#define MT6357_AFE_UL_SRC_CON0_L             0x210e
+#define MT6357_AFE_TOP_CON0                  0x2110
+#define MT6357_AUDIO_TOP_CON0                0x2112
+#define MT6357_AFE_MON_DEBUG0                0x2114
+#define MT6357_AFUNC_AUD_CON0                0x2116
+#define MT6357_AFUNC_AUD_CON1                0x2118
+#define MT6357_AFUNC_AUD_CON2                0x211a
+#define MT6357_AFUNC_AUD_CON3                0x211c
+#define MT6357_AFUNC_AUD_CON4                0x211e
+#define MT6357_AFUNC_AUD_CON5                0x2120
+#define MT6357_AFUNC_AUD_CON6                0x2122
+#define MT6357_AFUNC_AUD_MON0                0x2124
+#define MT6357_AUDRC_TUNE_MON0               0x2126
+#define MT6357_AFE_ADDA_MTKAIF_FIFO_CFG0     0x2128
+#define MT6357_AFE_ADDA_MTKAIF_FIFO_LOG_MON1 0x212a
+#define MT6357_AFE_ADDA_MTKAIF_MON0          0x212c
+#define MT6357_AFE_ADDA_MTKAIF_MON1          0x212e
+#define MT6357_AFE_ADDA_MTKAIF_MON2          0x2130
+#define MT6357_AFE_ADDA_MTKAIF_MON3          0x2132
+#define MT6357_AFE_ADDA_MTKAIF_CFG0          0x2134
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG0       0x2136
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG1       0x2138
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG2       0x213a
+#define MT6357_AFE_ADDA_MTKAIF_RX_CFG3       0x213c
+#define MT6357_AFE_ADDA_MTKAIF_TX_CFG1       0x213e
+#define MT6357_AFE_SGEN_CFG0                 0x2140
+#define MT6357_AFE_SGEN_CFG1                 0x2142
+#define MT6357_AFE_ADC_ASYNC_FIFO_CFG        0x2144
+#define MT6357_AFE_DCCLK_CFG0                0x2146
+#define MT6357_AFE_DCCLK_CFG1                0x2148
+#define MT6357_AUDIO_DIG_CFG                 0x214a
+#define MT6357_AFE_AUD_PAD_TOP               0x214c
+#define MT6357_AFE_AUD_PAD_TOP_MON           0x214e
+#define MT6357_AFE_AUD_PAD_TOP_MON1          0x2150
+#define MT6357_AUDENC_DSN_ID                 0x2180
+#define MT6357_AUDENC_DSN_REV0               0x2182
+#define MT6357_AUDENC_DSN_DBI                0x2184
+#define MT6357_AUDENC_DSN_FPI                0x2186
+#define MT6357_AUDENC_ANA_CON0               0x2188
+#define MT6357_AUDENC_ANA_CON1               0x218a
+#define MT6357_AUDENC_ANA_CON2               0x218c
+#define MT6357_AUDENC_ANA_CON3               0x218e
+#define MT6357_AUDENC_ANA_CON4               0x2190
+#define MT6357_AUDENC_ANA_CON5               0x2192
+#define MT6357_AUDENC_ANA_CON6               0x2194
+#define MT6357_AUDENC_ANA_CON7               0x2196
+#define MT6357_AUDENC_ANA_CON8               0x2198
+#define MT6357_AUDENC_ANA_CON9               0x219a
+#define MT6357_AUDENC_ANA_CON10              0x219c
+#define MT6357_AUDENC_ANA_CON11              0x219e
+#define MT6357_AUDDEC_DSN_ID                 0x2200
+#define MT6357_AUDDEC_DSN_REV0               0x2202
+#define MT6357_AUDDEC_DSN_DBI                0x2204
+#define MT6357_AUDDEC_DSN_FPI                0x2206
+#define MT6357_AUDDEC_ANA_CON0               0x2208
+#define MT6357_AUDDEC_ANA_CON1               0x220a
+#define MT6357_AUDDEC_ANA_CON2               0x220c
+#define MT6357_AUDDEC_ANA_CON3               0x220e
+#define MT6357_AUDDEC_ANA_CON4               0x2210
+#define MT6357_AUDDEC_ANA_CON5               0x2212
+#define MT6357_AUDDEC_ANA_CON6               0x2214
+#define MT6357_AUDDEC_ANA_CON7               0x2216
+#define MT6357_AUDDEC_ANA_CON8               0x2218
+#define MT6357_AUDDEC_ANA_CON9               0x221a
+#define MT6357_AUDDEC_ANA_CON10              0x221c
+#define MT6357_AUDDEC_ANA_CON11              0x221e
+#define MT6357_AUDDEC_ANA_CON12              0x2220
+#define MT6357_AUDDEC_ANA_CON13              0x2222
+#define MT6357_AUDDEC_ELR_NUM                0x2224
+#define MT6357_AUDDEC_ELR_0                  0x2226
+#define MT6357_AUDZCD_DSN_ID                 0x2280
+#define MT6357_AUDZCD_DSN_REV0               0x2282
+#define MT6357_AUDZCD_DSN_DBI                0x2284
+#define MT6357_AUDZCD_DSN_FPI                0x2286
+#define MT6357_ZCD_CON0                      0x2288
+#define MT6357_ZCD_CON1                      0x228a
+#define MT6357_ZCD_CON2                      0x228c
+#define MT6357_ZCD_CON3                      0x228e
+#define MT6357_ZCD_CON4                      0x2290
+#define MT6357_ZCD_CON5                      0x2292
+#define MT6357_ACCDET_DSN_DIG_ID             0x2300
+#define MT6357_ACCDET_DSN_DIG_REV0           0x2302
+#define MT6357_ACCDET_DSN_DBI                0x2304
+#define MT6357_ACCDET_DSN_FPI                0x2306
+#define MT6357_ACCDET_CON0                   0x2308
+#define MT6357_ACCDET_CON1                   0x230a
+#define MT6357_ACCDET_CON2                   0x230c
+#define MT6357_ACCDET_CON3                   0x230e
+#define MT6357_ACCDET_CON4                   0x2310
+#define MT6357_ACCDET_CON5                   0x2312
+#define MT6357_ACCDET_CON6                   0x2314
+#define MT6357_ACCDET_CON7                   0x2316
+#define MT6357_ACCDET_CON8                   0x2318
+#define MT6357_ACCDET_CON9                   0x231a
+#define MT6357_ACCDET_CON10                  0x231c
+#define MT6357_ACCDET_CON11                  0x231e
+#define MT6357_ACCDET_CON12                  0x2320
+#define MT6357_ACCDET_CON13                  0x2322
+#define MT6357_ACCDET_CON14                  0x2324
+#define MT6357_ACCDET_CON15                  0x2326
+#define MT6357_ACCDET_CON16                  0x2328
+#define MT6357_ACCDET_CON17                  0x232a
+#define MT6357_ACCDET_CON18                  0x232c
+#define MT6357_ACCDET_CON19                  0x232e
+#define MT6357_ACCDET_CON20                  0x2330
+#define MT6357_ACCDET_CON21                  0x2332
+#define MT6357_ACCDET_CON22                  0x2334
+#define MT6357_ACCDET_CON23                  0x2336
+#define MT6357_ACCDET_CON24                  0x2338
+#define MT6357_ACCDET_CON25                  0x233a
+#define MT6357_ACCDET_CON26                  0x233c
+#define MT6357_ACCDET_CON27                  0x233e
+#define MT6357_ACCDET_CON28                  0x2340
+
+#endif /* __MFD_MT6357_REGISTERS_H__ */
-- 
cgit v1.2.3


From 738654be3cf7af4a0a131bd92142db045f0660dc Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Tue, 31 May 2022 14:49:57 +0200
Subject: mfd: mt6358-irq: Add MT6357 PMIC support

Add MT6357 PMIC IRQ support.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220531124959.202787-6-fparent@baylibre.com
---
 drivers/mfd/mt6358-irq.c        | 24 ++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h |  1 +
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6358-irq.c b/drivers/mfd/mt6358-irq.c
index ea5e452510eb..389756436af6 100644
--- a/drivers/mfd/mt6358-irq.c
+++ b/drivers/mfd/mt6358-irq.c
@@ -3,6 +3,8 @@
 // Copyright (c) 2020 MediaTek Inc.
 
 #include <linux/interrupt.h>
+#include <linux/mfd/mt6357/core.h>
+#include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6358/registers.h>
 #include <linux/mfd/mt6359/core.h>
@@ -17,6 +19,17 @@
 
 #define MTK_PMIC_REG_WIDTH 16
 
+static const struct irq_top_t mt6357_ints[] = {
+	MT6357_TOP_GEN(BUCK),
+	MT6357_TOP_GEN(LDO),
+	MT6357_TOP_GEN(PSC),
+	MT6357_TOP_GEN(SCK),
+	MT6357_TOP_GEN(BM),
+	MT6357_TOP_GEN(HK),
+	MT6357_TOP_GEN(AUD),
+	MT6357_TOP_GEN(MISC),
+};
+
 static const struct irq_top_t mt6358_ints[] = {
 	MT6358_TOP_GEN(BUCK),
 	MT6358_TOP_GEN(LDO),
@@ -39,6 +52,13 @@ static const struct irq_top_t mt6359_ints[] = {
 	MT6359_TOP_GEN(MISC),
 };
 
+static struct pmic_irq_data mt6357_irqd = {
+	.num_top = ARRAY_SIZE(mt6357_ints),
+	.num_pmic_irqs = MT6357_IRQ_NR,
+	.top_int_status_reg = MT6357_TOP_INT_STATUS0,
+	.pmic_ints = mt6357_ints,
+};
+
 static struct pmic_irq_data mt6358_irqd = {
 	.num_top = ARRAY_SIZE(mt6358_ints),
 	.num_pmic_irqs = MT6358_IRQ_NR,
@@ -211,6 +231,10 @@ int mt6358_irq_init(struct mt6397_chip *chip)
 	struct pmic_irq_data *irqd;
 
 	switch (chip->chip_id) {
+	case MT6357_CHIP_ID:
+		chip->irq_data = &mt6357_irqd;
+		break;
+
 	case MT6358_CHIP_ID:
 	case MT6366_CHIP_ID:
 		chip->irq_data = &mt6358_irqd;
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 1cf78726503b..3fecaffe5019 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -12,6 +12,7 @@
 
 enum chip_id {
 	MT6323_CHIP_ID = 0x23,
+	MT6357_CHIP_ID = 0x57,
 	MT6358_CHIP_ID = 0x58,
 	MT6359_CHIP_ID = 0x59,
 	MT6366_CHIP_ID = 0x66,
-- 
cgit v1.2.3


From 66ee379d743c69c726b61d078119a34d5be96a35 Mon Sep 17 00:00:00 2001
From: Tinghan Shen <tinghan.shen@mediatek.com>
Date: Wed, 1 Jun 2022 19:22:01 +0800
Subject: mfd: cros_ec: Add SCP Core-1 as a new CrOS EC MCU

MT8195 System Companion Processors(SCP) is a dual-core RISC-V MCU.
Add a new CrOS feature ID to represent the SCP's 2nd core.

The 1st core is referred to as 'core 0', and the 2nd core is referred
to as 'core 1'.

Signed-off-by: Tinghan Shen <tinghan.shen@mediatek.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220601112201.15510-16-tinghan.shen@mediatek.com
---
 drivers/mfd/cros_ec_dev.c                      | 5 +++++
 include/linux/platform_data/cros_ec_commands.h | 2 ++
 include/linux/platform_data/cros_ec_proto.h    | 1 +
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
index 596731caf407..07cc31d92edc 100644
--- a/drivers/mfd/cros_ec_dev.c
+++ b/drivers/mfd/cros_ec_dev.c
@@ -64,6 +64,11 @@ static const struct cros_feature_to_name cros_mcu_devices[] = {
 		.name	= CROS_EC_DEV_SCP_NAME,
 		.desc	= "System Control Processor",
 	},
+	{
+		.id	= EC_FEATURE_SCP_C1,
+		.name	= CROS_EC_DEV_SCP_C1_NAME,
+		.desc	= "System Control Processor 2nd Core",
+	},
 	{
 		.id	= EC_FEATURE_TOUCHPAD,
 		.name	= CROS_EC_DEV_TP_NAME,
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 8cfa8cfca77e..9fbf1c5eb8d3 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -1300,6 +1300,8 @@ enum ec_feature_code {
 	 * mux.
 	 */
 	EC_FEATURE_TYPEC_MUX_REQUIRE_AP_ACK = 43,
+	/* The MCU is a System Companion Processor (SCP) 2nd Core. */
+	EC_FEATURE_SCP_C1 = 45,
 };
 
 #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32)
diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h
index 138fd912c808..da06dc7cf1cb 100644
--- a/include/linux/platform_data/cros_ec_proto.h
+++ b/include/linux/platform_data/cros_ec_proto.h
@@ -19,6 +19,7 @@
 #define CROS_EC_DEV_ISH_NAME	"cros_ish"
 #define CROS_EC_DEV_PD_NAME	"cros_pd"
 #define CROS_EC_DEV_SCP_NAME	"cros_scp"
+#define CROS_EC_DEV_SCP_C1_NAME	"cros_scp_c1"
 #define CROS_EC_DEV_TP_NAME	"cros_tp"
 
 /*
-- 
cgit v1.2.3


From 4a346a03a63cb45f7766d9d6559cf3fee783e926 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 14 Jun 2022 17:21:48 +0200
Subject: mfd: twl: Remove platform data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no in-tree machine that provides a struct twl4030_platform_data
since commit e92fc4f04a34 ("ARM: OMAP2+: Drop legacy board file for
LDP"). So assume dev_get_platdata() returns NULL in twl_probe() and
simplify accordingly.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220614152148.252820-1-u.kleine-koenig@pengutronix.de
---
 drivers/mfd/twl-core.c  | 323 +-----------------------------------------------
 include/linux/mfd/twl.h |  55 ---------
 2 files changed, 5 insertions(+), 373 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c
index bd6659cf3bc0..2cb9326f3e61 100644
--- a/drivers/mfd/twl-core.c
+++ b/drivers/mfd/twl-core.c
@@ -656,309 +656,6 @@ static inline struct device *add_child(unsigned mod_no, const char *name,
 		can_wakeup, irq0, irq1);
 }
 
-static struct device *
-add_regulator_linked(int num, struct regulator_init_data *pdata,
-		struct regulator_consumer_supply *consumers,
-		unsigned num_consumers, unsigned long features)
-{
-	struct twl_regulator_driver_data drv_data;
-
-	/* regulator framework demands init_data ... */
-	if (!pdata)
-		return NULL;
-
-	if (consumers) {
-		pdata->consumer_supplies = consumers;
-		pdata->num_consumer_supplies = num_consumers;
-	}
-
-	if (pdata->driver_data) {
-		/* If we have existing drv_data, just add the flags */
-		struct twl_regulator_driver_data *tmp;
-		tmp = pdata->driver_data;
-		tmp->features |= features;
-	} else {
-		/* add new driver data struct, used only during init */
-		drv_data.features = features;
-		drv_data.set_voltage = NULL;
-		drv_data.get_voltage = NULL;
-		drv_data.data = NULL;
-		pdata->driver_data = &drv_data;
-	}
-
-	/* NOTE:  we currently ignore regulator IRQs, e.g. for short circuits */
-	return add_numbered_child(TWL_MODULE_PM_MASTER, "twl_reg", num,
-		pdata, sizeof(*pdata), false, 0, 0);
-}
-
-static struct device *
-add_regulator(int num, struct regulator_init_data *pdata,
-		unsigned long features)
-{
-	return add_regulator_linked(num, pdata, NULL, 0, features);
-}
-
-/*
- * NOTE:  We know the first 8 IRQs after pdata->base_irq are
- * for the PIH, and the next are for the PWR_INT SIH, since
- * that's how twl_init_irq() sets things up.
- */
-
-static int
-add_children(struct twl4030_platform_data *pdata, unsigned irq_base,
-		unsigned long features)
-{
-	struct device	*child;
-
-	if (IS_ENABLED(CONFIG_GPIO_TWL4030) && pdata->gpio) {
-		child = add_child(TWL4030_MODULE_GPIO, "twl4030_gpio",
-				pdata->gpio, sizeof(*pdata->gpio),
-				false, irq_base + GPIO_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_KEYBOARD_TWL4030) && pdata->keypad) {
-		child = add_child(TWL4030_MODULE_KEYPAD, "twl4030_keypad",
-				pdata->keypad, sizeof(*pdata->keypad),
-				true, irq_base + KEYPAD_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_MADC) && pdata->madc &&
-	    twl_class_is_4030()) {
-		child = add_child(TWL4030_MODULE_MADC, "twl4030_madc",
-				pdata->madc, sizeof(*pdata->madc),
-				true, irq_base + MADC_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_RTC_DRV_TWL4030)) {
-		/*
-		 * REVISIT platform_data here currently might expose the
-		 * "msecure" line ... but for now we just expect board
-		 * setup to tell the chip "it's always ok to SET_TIME".
-		 * Eventually, Linux might become more aware of such
-		 * HW security concerns, and "least privilege".
-		 */
-		child = add_child(TWL_MODULE_RTC, "twl_rtc", NULL, 0,
-				true, irq_base + RTC_INTR_OFFSET, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_PWM_TWL)) {
-		child = add_child(TWL_MODULE_PWM, "twl-pwm", NULL, 0,
-				  false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_PWM_TWL_LED)) {
-		child = add_child(TWL_MODULE_LED, "twl-pwmled", NULL, 0,
-				  false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_USB) && pdata->usb &&
-	    twl_class_is_4030()) {
-
-		static struct regulator_consumer_supply usb1v5 = {
-			.supply =	"usb1v5",
-		};
-		static struct regulator_consumer_supply usb1v8 = {
-			.supply =	"usb1v8",
-		};
-		static struct regulator_consumer_supply usb3v1 = {
-			.supply =	"usb3v1",
-		};
-
-	/* First add the regulators so that they can be used by transceiver */
-		if (IS_ENABLED(CONFIG_REGULATOR_TWL4030)) {
-			/* this is a template that gets copied */
-			struct regulator_init_data usb_fixed = {
-				.constraints.valid_modes_mask =
-					REGULATOR_MODE_NORMAL
-					| REGULATOR_MODE_STANDBY,
-				.constraints.valid_ops_mask =
-					REGULATOR_CHANGE_MODE
-					| REGULATOR_CHANGE_STATUS,
-			};
-
-			child = add_regulator_linked(TWL4030_REG_VUSB1V5,
-						      &usb_fixed, &usb1v5, 1,
-						      features);
-			if (IS_ERR(child))
-				return PTR_ERR(child);
-
-			child = add_regulator_linked(TWL4030_REG_VUSB1V8,
-						      &usb_fixed, &usb1v8, 1,
-						      features);
-			if (IS_ERR(child))
-				return PTR_ERR(child);
-
-			child = add_regulator_linked(TWL4030_REG_VUSB3V1,
-						      &usb_fixed, &usb3v1, 1,
-						      features);
-			if (IS_ERR(child))
-				return PTR_ERR(child);
-
-		}
-
-		child = add_child(TWL_MODULE_USB, "twl4030_usb",
-				pdata->usb, sizeof(*pdata->usb), true,
-				/* irq0 = USB_PRES, irq1 = USB */
-				irq_base + USB_PRES_INTR_OFFSET,
-				irq_base + USB_INTR_OFFSET);
-
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		/* we need to connect regulators to this transceiver */
-		if (IS_ENABLED(CONFIG_REGULATOR_TWL4030) && child) {
-			usb1v5.dev_name = dev_name(child);
-			usb1v8.dev_name = dev_name(child);
-			usb3v1.dev_name = dev_name(child);
-		}
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_WATCHDOG) && twl_class_is_4030()) {
-		child = add_child(TWL_MODULE_PM_RECEIVER, "twl4030_wdt", NULL,
-				  0, false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_INPUT_TWL4030_PWRBUTTON) && twl_class_is_4030()) {
-		child = add_child(TWL_MODULE_PM_MASTER, "twl4030_pwrbutton",
-				  NULL, 0, true, irq_base + 8 + 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_MFD_TWL4030_AUDIO) && pdata->audio &&
-	    twl_class_is_4030()) {
-		child = add_child(TWL4030_MODULE_AUDIO_VOICE, "twl4030-audio",
-				pdata->audio, sizeof(*pdata->audio),
-				false, 0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	/* twl4030 regulators */
-	if (IS_ENABLED(CONFIG_REGULATOR_TWL4030) && twl_class_is_4030()) {
-		child = add_regulator(TWL4030_REG_VPLL1, pdata->vpll1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VIO, pdata->vio,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VDD1, pdata->vdd1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VDD2, pdata->vdd2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VMMC1, pdata->vmmc1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VDAC, pdata->vdac,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator((features & TWL4030_VAUX2)
-					? TWL4030_REG_VAUX2_4030
-					: TWL4030_REG_VAUX2,
-				pdata->vaux2, features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VINTANA1, pdata->vintana1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VINTANA2, pdata->vintana2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VINTDIG, pdata->vintdig,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	/* maybe add LDOs that are omitted on cost-reduced parts */
-	if (IS_ENABLED(CONFIG_REGULATOR_TWL4030) && !(features & TPS_SUBSET)
-	  && twl_class_is_4030()) {
-		child = add_regulator(TWL4030_REG_VPLL2, pdata->vpll2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VMMC2, pdata->vmmc2,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VSIM, pdata->vsim,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VAUX1, pdata->vaux1,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VAUX3, pdata->vaux3,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-
-		child = add_regulator(TWL4030_REG_VAUX4, pdata->vaux4,
-					features);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_CHARGER_TWL4030) && pdata->bci &&
-			!(features & (TPS_SUBSET | TWL5031))) {
-		child = add_child(TWL_MODULE_MAIN_CHARGE, "twl4030_bci",
-				pdata->bci, sizeof(*pdata->bci), false,
-				/* irq0 = CHG_PRES, irq1 = BCI */
-				irq_base + BCI_PRES_INTR_OFFSET,
-				irq_base + BCI_INTR_OFFSET);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	if (IS_ENABLED(CONFIG_TWL4030_POWER) && pdata->power) {
-		child = add_child(TWL_MODULE_PM_MASTER, "twl4030_power",
-				  pdata->power, sizeof(*pdata->power), false,
-				  0, 0);
-		if (IS_ERR(child))
-			return PTR_ERR(child);
-	}
-
-	return 0;
-}
-
 /*----------------------------------------------------------------------*/
 
 /*
@@ -987,8 +684,7 @@ static inline int unprotect_pm_master(void)
 	return e;
 }
 
-static void clocks_init(struct device *dev,
-			struct twl4030_clock_init_data *clock)
+static void clocks_init(struct device *dev)
 {
 	int e = 0;
 	struct clk *osc;
@@ -1018,8 +714,6 @@ static void clocks_init(struct device *dev,
 	}
 
 	ctrl |= HIGH_PERF_SQ;
-	if (clock && clock->ck32k_lowpwr_enable)
-		ctrl |= CK32K_LOWPWR_EN;
 
 	e |= unprotect_pm_master();
 	/* effect->MADC+USB ck en */
@@ -1063,7 +757,6 @@ static struct of_dev_auxdata twl_auxdata_lookup[] = {
 static int
 twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 {
-	struct twl4030_platform_data	*pdata = dev_get_platdata(&client->dev);
 	struct device_node		*node = client->dev.of_node;
 	struct platform_device		*pdev;
 	const struct regmap_config	*twl_regmap_config;
@@ -1071,7 +764,7 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	int				status;
 	unsigned			i, num_slaves;
 
-	if (!node && !pdata) {
+	if (!node) {
 		dev_err(&client->dev, "no platform data\n");
 		return -EINVAL;
 	}
@@ -1161,7 +854,7 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	twl_priv->ready = true;
 
 	/* setup clock framework */
-	clocks_init(&client->dev, pdata ? pdata->clock : NULL);
+	clocks_init(&client->dev);
 
 	/* read TWL IDCODE Register */
 	if (twl_class_is_4030()) {
@@ -1209,14 +902,8 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id)
 				 TWL4030_DCDC_GLOBAL_CFG);
 	}
 
-	if (node) {
-		if (pdata)
-			twl_auxdata_lookup[0].platform_data = pdata->gpio;
-		status = of_platform_populate(node, NULL, twl_auxdata_lookup,
-					      &client->dev);
-	} else {
-		status = add_children(pdata, irq_base, id->driver_data);
-	}
+	status = of_platform_populate(node, NULL, twl_auxdata_lookup,
+				      &client->dev);
 
 fail:
 	if (status < 0)
diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h
index 8871cc5188a0..e426938eafd5 100644
--- a/include/linux/mfd/twl.h
+++ b/include/linux/mfd/twl.h
@@ -694,61 +694,6 @@ struct twl4030_audio_data {
 	unsigned int irq_base;
 };
 
-struct twl4030_platform_data {
-	struct twl4030_clock_init_data		*clock;
-	struct twl4030_bci_platform_data	*bci;
-	struct twl4030_gpio_platform_data	*gpio;
-	struct twl4030_madc_platform_data	*madc;
-	struct twl4030_keypad_data		*keypad;
-	struct twl4030_usb_data			*usb;
-	struct twl4030_power_data		*power;
-	struct twl4030_audio_data		*audio;
-
-	/* Common LDO regulators for TWL4030/TWL6030 */
-	struct regulator_init_data		*vdac;
-	struct regulator_init_data		*vaux1;
-	struct regulator_init_data		*vaux2;
-	struct regulator_init_data		*vaux3;
-	struct regulator_init_data		*vdd1;
-	struct regulator_init_data		*vdd2;
-	struct regulator_init_data		*vdd3;
-	/* TWL4030 LDO regulators */
-	struct regulator_init_data		*vpll1;
-	struct regulator_init_data		*vpll2;
-	struct regulator_init_data		*vmmc1;
-	struct regulator_init_data		*vmmc2;
-	struct regulator_init_data		*vsim;
-	struct regulator_init_data		*vaux4;
-	struct regulator_init_data		*vio;
-	struct regulator_init_data		*vintana1;
-	struct regulator_init_data		*vintana2;
-	struct regulator_init_data		*vintdig;
-	/* TWL6030 LDO regulators */
-	struct regulator_init_data              *vmmc;
-	struct regulator_init_data              *vpp;
-	struct regulator_init_data              *vusim;
-	struct regulator_init_data              *vana;
-	struct regulator_init_data              *vcxio;
-	struct regulator_init_data              *vusb;
-	struct regulator_init_data		*clk32kg;
-	struct regulator_init_data              *v1v8;
-	struct regulator_init_data              *v2v1;
-	/* TWL6032 LDO regulators */
-	struct regulator_init_data		*ldo1;
-	struct regulator_init_data		*ldo2;
-	struct regulator_init_data		*ldo3;
-	struct regulator_init_data		*ldo4;
-	struct regulator_init_data		*ldo5;
-	struct regulator_init_data		*ldo6;
-	struct regulator_init_data		*ldo7;
-	struct regulator_init_data		*ldoln;
-	struct regulator_init_data		*ldousb;
-	/* TWL6032 DCDC regulators */
-	struct regulator_init_data		*smps3;
-	struct regulator_init_data		*smps4;
-	struct regulator_init_data		*vio6025;
-};
-
 struct twl_regulator_driver_data {
 	int		(*set_voltage)(void *data, int target_uV);
 	int		(*get_voltage)(void *data);
-- 
cgit v1.2.3


From c55333064d6ea9a26c7e8cfbe2ba1fa77c3a8e48 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 19 Jun 2022 10:26:55 +0200
Subject: mfd: tc6393xb: Make disable callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All implementations return 0, so simplify accordingly.

This is a preparation for making platform remove callbacks return void.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220619082655.53728-1-u.kleine-koenig@pengutronix.de
---
 arch/arm/mach-pxa/eseries.c  | 3 +--
 arch/arm/mach-pxa/tosa.c     | 4 +---
 drivers/mfd/tc6393xb.c       | 5 ++---
 include/linux/mfd/tc6393xb.h | 2 +-
 4 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c
index 99781eec065e..6226aee152d3 100644
--- a/arch/arm/mach-pxa/eseries.c
+++ b/arch/arm/mach-pxa/eseries.c
@@ -86,11 +86,10 @@ int eseries_tmio_enable(struct platform_device *dev)
 	return 0;
 }
 
-int eseries_tmio_disable(struct platform_device *dev)
+void eseries_tmio_disable(struct platform_device *dev)
 {
 	gpio_set_value(GPIO_ESERIES_TMIO_SUSPEND, 0);
 	gpio_set_value(GPIO_ESERIES_TMIO_PCLR, 0);
-	return 0;
 }
 
 int eseries_tmio_suspend(struct platform_device *dev)
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index 6af8bc404825..d41641d6cfcd 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -678,13 +678,11 @@ err_req_pclr:
 	return rc;
 }
 
-static int tosa_tc6393xb_disable(struct platform_device *dev)
+static void tosa_tc6393xb_disable(struct platform_device *dev)
 {
 	gpio_free(TOSA_GPIO_TC6393XB_L3V_ON);
 	gpio_free(TOSA_GPIO_TC6393XB_SUSPEND);
 	gpio_free(TOSA_GPIO_TC6393XB_REST_IN);
-
-	return 0;
 }
 
 static int tosa_tc6393xb_resume(struct platform_device *dev)
diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index 0be5731685b4..aa903a31dd43 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -798,20 +798,19 @@ static int tc6393xb_remove(struct platform_device *dev)
 {
 	struct tc6393xb_platform_data *tcpd = dev_get_platdata(&dev->dev);
 	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
-	int ret;
 
 	mfd_remove_devices(&dev->dev);
 
 	tc6393xb_detach_irq(dev);
 
-	ret = tcpd->disable(dev);
+	tcpd->disable(dev);
 	clk_disable_unprepare(tc6393xb->clk);
 	iounmap(tc6393xb->scr);
 	release_resource(&tc6393xb->rscr);
 	clk_put(tc6393xb->clk);
 	kfree(tc6393xb);
 
-	return ret;
+	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index d336c541b7df..d17807f2d0c9 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -22,7 +22,7 @@ struct tc6393xb_platform_data {
 	u16	scr_gper;	/* GP Enable */
 
 	int	(*enable)(struct platform_device *dev);
-	int	(*disable)(struct platform_device *dev);
+	void	(*disable)(struct platform_device *dev);
 	int	(*suspend)(struct platform_device *dev);
 	int	(*resume)(struct platform_device *dev);
 
-- 
cgit v1.2.3


From 79f821b5a3bf46d2d5ee2dc0e2b2428b1062a040 Mon Sep 17 00:00:00 2001
From: Zhang Jiaming <jiaming@nfschina.com>
Date: Thu, 23 Jun 2022 15:33:33 +0800
Subject: mfd: ipaq-micro: Fix spelling mistake of "receive{d}"

Change 'receieved' to 'received' and 'recieve' to 'receive'.

Signed-off-by: Zhang Jiaming <jiaming@nfschina.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220623073333.5675-1-jiaming@nfschina.com
---
 include/linux/mfd/ipaq-micro.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/ipaq-micro.h b/include/linux/mfd/ipaq-micro.h
index ee48a4321c57..d5caa4c86ecc 100644
--- a/include/linux/mfd/ipaq-micro.h
+++ b/include/linux/mfd/ipaq-micro.h
@@ -75,8 +75,8 @@ struct ipaq_micro_rxdev {
  * @id: 4-bit ID of the message
  * @tx_len: length of TX data
  * @tx_data: TX data to send
- * @rx_len: length of receieved RX data
- * @rx_data: RX data to recieve
+ * @rx_len: length of received RX data
+ * @rx_data: RX data to receive
  * @ack: a completion that will be completed when RX is complete
  * @node: list node if message gets queued
  */
-- 
cgit v1.2.3


From d9cd0bc604705eb01497c3e483f8820a9677c682 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Mon, 27 Jun 2022 14:39:54 +0200
Subject: mfd: mt6397: Add basic support for MT6331+MT6332 PMIC

Add support for the MT6331 PMIC with MT6332 Companion PMIC, found
in MT6795 Helio X10 smartphone platforms.

This combo has support for multiple devices but, for a start,
only the following have been implemented:
- Regulators (two instances, one in MT6331, one in MT6332)
- RTC (MT6331)
- Keys (MT6331)
- Interrupts (MT6331 also dispatches MT6332's interrupts)

There's more to be implemented, especially for MT6332, which
will come at a later stage.

Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Link: https://lore.kernel.org/r/20220627123954.64299-1-angelogioacchino.delregno@collabora.com
---
 drivers/mfd/mt6397-core.c            |  47 +++
 drivers/mfd/mt6397-irq.c             |   9 +-
 include/linux/mfd/mt6331/core.h      |  40 +++
 include/linux/mfd/mt6331/registers.h | 584 +++++++++++++++++++++++++++++++
 include/linux/mfd/mt6332/core.h      |  65 ++++
 include/linux/mfd/mt6332/registers.h | 642 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/mt6397/core.h      |   2 +
 7 files changed, 1388 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mfd/mt6331/core.h
 create mode 100644 include/linux/mfd/mt6331/registers.h
 create mode 100644 include/linux/mfd/mt6332/core.h
 create mode 100644 include/linux/mfd/mt6332/registers.h

(limited to 'include/linux')

diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 3cb8836bd08d..f6c1f80f94a4 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -12,11 +12,13 @@
 #include <linux/regmap.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6331/core.h>
 #include <linux/mfd/mt6357/core.h>
 #include <linux/mfd/mt6358/core.h>
 #include <linux/mfd/mt6359/core.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6331/registers.h>
 #include <linux/mfd/mt6357/registers.h>
 #include <linux/mfd/mt6358/registers.h>
 #include <linux/mfd/mt6359/registers.h>
@@ -28,6 +30,9 @@
 #define MT6357_RTC_BASE		0x0588
 #define MT6357_RTC_SIZE		0x3c
 
+#define MT6331_RTC_BASE		0x4000
+#define MT6331_RTC_SIZE		0x40
+
 #define MT6358_RTC_BASE		0x0588
 #define MT6358_RTC_SIZE		0x3c
 
@@ -47,6 +52,11 @@ static const struct resource mt6357_rtc_resources[] = {
 	DEFINE_RES_IRQ(MT6357_IRQ_RTC),
 };
 
+static const struct resource mt6331_rtc_resources[] = {
+	DEFINE_RES_MEM(MT6331_RTC_BASE, MT6331_RTC_SIZE),
+	DEFINE_RES_IRQ(MT6331_IRQ_STATUS_RTC),
+};
+
 static const struct resource mt6358_rtc_resources[] = {
 	DEFINE_RES_MEM(MT6358_RTC_BASE, MT6358_RTC_SIZE),
 	DEFINE_RES_IRQ(MT6358_IRQ_RTC),
@@ -83,6 +93,11 @@ static const struct resource mt6357_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6357_IRQ_HOMEKEY_R, "homekey_r"),
 };
 
+static const struct resource mt6331_keys_resources[] = {
+	DEFINE_RES_IRQ_NAMED(MT6331_IRQ_STATUS_PWRKEY, "powerkey"),
+	DEFINE_RES_IRQ_NAMED(MT6331_IRQ_STATUS_HOMEKEY, "homekey"),
+};
+
 static const struct resource mt6397_keys_resources[] = {
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_PWRKEY, "powerkey"),
 	DEFINE_RES_IRQ_NAMED(MT6397_IRQ_HOMEKEY, "homekey"),
@@ -133,6 +148,27 @@ static const struct mfd_cell mt6357_devs[] = {
 	},
 };
 
+/* MT6331 is always used in combination with MT6332 */
+static const struct mfd_cell mt6331_mt6332_devs[] = {
+	{
+		.name = "mt6331-rtc",
+		.num_resources = ARRAY_SIZE(mt6331_rtc_resources),
+		.resources = mt6331_rtc_resources,
+		.of_compatible = "mediatek,mt6331-rtc",
+	}, {
+		.name = "mt6331-regulator",
+		.of_compatible = "mediatek,mt6331-regulator"
+	}, {
+		.name = "mt6332-regulator",
+		.of_compatible = "mediatek,mt6332-regulator"
+	}, {
+		.name = "mtk-pmic-keys",
+		.num_resources = ARRAY_SIZE(mt6331_keys_resources),
+		.resources = mt6331_keys_resources,
+		.of_compatible = "mediatek,mt6331-keys"
+	},
+};
+
 static const struct mfd_cell mt6358_devs[] = {
 	{
 		.name = "mt6358-regulator",
@@ -220,6 +256,14 @@ static const struct chip_data mt6357_core = {
 	.irq_init = mt6358_irq_init,
 };
 
+static const struct chip_data mt6331_mt6332_core = {
+	.cid_addr = MT6331_HWCID,
+	.cid_shift = 0,
+	.cells = mt6331_mt6332_devs,
+	.cell_size = ARRAY_SIZE(mt6331_mt6332_devs),
+	.irq_init = mt6397_irq_init,
+};
+
 static const struct chip_data mt6358_core = {
 	.cid_addr = MT6358_SWCID,
 	.cid_shift = 8,
@@ -302,6 +346,9 @@ static const struct of_device_id mt6397_of_match[] = {
 	{
 		.compatible = "mediatek,mt6323",
 		.data = &mt6323_core,
+	}, {
+		.compatible = "mediatek,mt6331",
+		.data = &mt6331_mt6332_core,
 	}, {
 		.compatible = "mediatek,mt6357",
 		.data = &mt6357_core,
diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c
index 2924919da991..eff53fed8fe7 100644
--- a/drivers/mfd/mt6397-irq.c
+++ b/drivers/mfd/mt6397-irq.c
@@ -12,6 +12,8 @@
 #include <linux/suspend.h>
 #include <linux/mfd/mt6323/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6331/core.h>
+#include <linux/mfd/mt6331/registers.h>
 #include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6397/registers.h>
 
@@ -172,7 +174,12 @@ int mt6397_irq_init(struct mt6397_chip *chip)
 		chip->int_status[0] = MT6323_INT_STATUS0;
 		chip->int_status[1] = MT6323_INT_STATUS1;
 		break;
-
+	case MT6331_CHIP_ID:
+		chip->int_con[0] = MT6331_INT_CON0;
+		chip->int_con[1] = MT6331_INT_CON1;
+		chip->int_status[0] = MT6331_INT_STATUS_CON0;
+		chip->int_status[1] = MT6331_INT_STATUS_CON1;
+		break;
 	case MT6391_CHIP_ID:
 	case MT6397_CHIP_ID:
 		chip->int_con[0] = MT6397_INT_CON0;
diff --git a/include/linux/mfd/mt6331/core.h b/include/linux/mfd/mt6331/core.h
new file mode 100644
index 000000000000..df8e6b1e4bc1
--- /dev/null
+++ b/include/linux/mfd/mt6331/core.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6331_CORE_H__
+#define __MFD_MT6331_CORE_H__
+
+enum mt6331_irq_status_numbers {
+	MT6331_IRQ_STATUS_PWRKEY = 0,
+	MT6331_IRQ_STATUS_HOMEKEY,
+	MT6331_IRQ_STATUS_CHRDET,
+	MT6331_IRQ_STATUS_THR_H,
+	MT6331_IRQ_STATUS_THR_L,
+	MT6331_IRQ_STATUS_BAT_H,
+	MT6331_IRQ_STATUS_BAT_L,
+	MT6331_IRQ_STATUS_RTC,
+	MT6331_IRQ_STATUS_AUDIO,
+	MT6331_IRQ_STATUS_MAD,
+	MT6331_IRQ_STATUS_ACCDET,
+	MT6331_IRQ_STATUS_ACCDET_EINT,
+	MT6331_IRQ_STATUS_ACCDET_NEGV = 12,
+	MT6331_IRQ_STATUS_VDVFS11_OC = 16,
+	MT6331_IRQ_STATUS_VDVFS12_OC,
+	MT6331_IRQ_STATUS_VDVFS13_OC,
+	MT6331_IRQ_STATUS_VDVFS14_OC,
+	MT6331_IRQ_STATUS_GPU_OC,
+	MT6331_IRQ_STATUS_VCORE1_OC,
+	MT6331_IRQ_STATUS_VCORE2_OC,
+	MT6331_IRQ_STATUS_VIO18_OC,
+	MT6331_IRQ_STATUS_LDO_OC,
+	MT6331_IRQ_STATUS_NR,
+};
+
+#define MT6331_IRQ_CON0_BASE	MT6331_IRQ_STATUS_PWRKEY
+#define MT6331_IRQ_CON0_BITS	(MT6331_IRQ_STATUS_ACCDET_NEGV + 1)
+#define MT6331_IRQ_CON1_BASE	MT6331_IRQ_STATUS_VDVFS11_OC
+#define MT6331_IRQ_CON1_BITS	(MT6331_IRQ_STATUS_LDO_OC - MT6331_IRQ_STATUS_VDFS11_OC + 1)
+
+#endif /* __MFD_MT6331_CORE_H__ */
diff --git a/include/linux/mfd/mt6331/registers.h b/include/linux/mfd/mt6331/registers.h
new file mode 100644
index 000000000000..e2be6bccd1a7
--- /dev/null
+++ b/include/linux/mfd/mt6331/registers.h
@@ -0,0 +1,584 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6331_REGISTERS_H__
+#define __MFD_MT6331_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6331_STRUP_CON0		0x0
+#define MT6331_STRUP_CON2		0x2
+#define MT6331_STRUP_CON3		0x4
+#define MT6331_STRUP_CON4		0x6
+#define MT6331_STRUP_CON5		0x8
+#define MT6331_STRUP_CON6		0xA
+#define MT6331_STRUP_CON7		0xC
+#define MT6331_STRUP_CON8		0xE
+#define MT6331_STRUP_CON9		0x10
+#define MT6331_STRUP_CON10		0x12
+#define MT6331_STRUP_CON11		0x14
+#define MT6331_STRUP_CON12		0x16
+#define MT6331_STRUP_CON13		0x18
+#define MT6331_STRUP_CON14		0x1A
+#define MT6331_STRUP_CON15		0x1C
+#define MT6331_STRUP_CON16		0x1E
+#define MT6331_STRUP_CON17		0x20
+#define MT6331_STRUP_CON18		0x22
+#define MT6331_HWCID			0x100
+#define MT6331_SWCID			0x102
+#define MT6331_EXT_PMIC_STATUS		0x104
+#define MT6331_TOP_CON			0x106
+#define MT6331_TEST_OUT			0x108
+#define MT6331_TEST_CON0		0x10A
+#define MT6331_TEST_CON1		0x10C
+#define MT6331_TESTMODE_SW		0x10E
+#define MT6331_EN_STATUS0		0x110
+#define MT6331_EN_STATUS1		0x112
+#define MT6331_EN_STATUS2		0x114
+#define MT6331_OCSTATUS0		0x116
+#define MT6331_OCSTATUS1		0x118
+#define MT6331_OCSTATUS2		0x11A
+#define MT6331_PGSTATUS			0x11C
+#define MT6331_TOPSTATUS		0x11E
+#define MT6331_TDSEL_CON		0x120
+#define MT6331_RDSEL_CON		0x122
+#define MT6331_SMT_CON0			0x124
+#define MT6331_SMT_CON1			0x126
+#define MT6331_SMT_CON2			0x128
+#define MT6331_DRV_CON0			0x12A
+#define MT6331_DRV_CON1			0x12C
+#define MT6331_DRV_CON2			0x12E
+#define MT6331_DRV_CON3			0x130
+#define MT6331_TOP_STATUS		0x132
+#define MT6331_TOP_STATUS_SET		0x134
+#define MT6331_TOP_STATUS_CLR		0x136
+#define MT6331_TOP_CKPDN_CON0		0x138
+#define MT6331_TOP_CKPDN_CON0_SET	0x13A
+#define MT6331_TOP_CKPDN_CON0_CLR	0x13C
+#define MT6331_TOP_CKPDN_CON1		0x13E
+#define MT6331_TOP_CKPDN_CON1_SET	0x140
+#define MT6331_TOP_CKPDN_CON1_CLR	0x142
+#define MT6331_TOP_CKPDN_CON2		0x144
+#define MT6331_TOP_CKPDN_CON2_SET	0x146
+#define MT6331_TOP_CKPDN_CON2_CLR	0x148
+#define MT6331_TOP_CKSEL_CON		0x14A
+#define MT6331_TOP_CKSEL_CON_SET	0x14C
+#define MT6331_TOP_CKSEL_CON_CLR	0x14E
+#define MT6331_TOP_CKHWEN_CON		0x150
+#define MT6331_TOP_CKHWEN_CON_SET	0x152
+#define MT6331_TOP_CKHWEN_CON_CLR	0x154
+#define MT6331_TOP_CKTST_CON0		0x156
+#define MT6331_TOP_CKTST_CON1		0x158
+#define MT6331_TOP_CLKSQ		0x15A
+#define MT6331_TOP_CLKSQ_SET		0x15C
+#define MT6331_TOP_CLKSQ_CLR		0x15E
+#define MT6331_TOP_RST_CON		0x160
+#define MT6331_TOP_RST_CON_SET		0x162
+#define MT6331_TOP_RST_CON_CLR		0x164
+#define MT6331_TOP_RST_MISC		0x166
+#define MT6331_TOP_RST_MISC_SET		0x168
+#define MT6331_TOP_RST_MISC_CLR		0x16A
+#define MT6331_INT_CON0			0x16C
+#define MT6331_INT_CON0_SET		0x16E
+#define MT6331_INT_CON0_CLR		0x170
+#define MT6331_INT_CON1			0x172
+#define MT6331_INT_CON1_SET		0x174
+#define MT6331_INT_CON1_CLR		0x176
+#define MT6331_INT_MISC_CON		0x178
+#define MT6331_INT_MISC_CON_SET		0x17A
+#define MT6331_INT_MISC_CON_CLR		0x17C
+#define MT6331_INT_STATUS_CON0		0x17E
+#define MT6331_INT_STATUS_CON1		0x180
+#define MT6331_OC_GEAR_0		0x182
+#define MT6331_FQMTR_CON0		0x184
+#define MT6331_FQMTR_CON1		0x186
+#define MT6331_FQMTR_CON2		0x188
+#define MT6331_RG_SPI_CON		0x18A
+#define MT6331_DEW_DIO_EN		0x18C
+#define MT6331_DEW_READ_TEST		0x18E
+#define MT6331_DEW_WRITE_TEST		0x190
+#define MT6331_DEW_CRC_SWRST		0x192
+#define MT6331_DEW_CRC_EN		0x194
+#define MT6331_DEW_CRC_VAL		0x196
+#define MT6331_DEW_DBG_MON_SEL		0x198
+#define MT6331_DEW_CIPHER_KEY_SEL	0x19A
+#define MT6331_DEW_CIPHER_IV_SEL	0x19C
+#define MT6331_DEW_CIPHER_EN		0x19E
+#define MT6331_DEW_CIPHER_RDY		0x1A0
+#define MT6331_DEW_CIPHER_MODE		0x1A2
+#define MT6331_DEW_CIPHER_SWRST		0x1A4
+#define MT6331_DEW_RDDMY_NO		0x1A6
+#define MT6331_INT_TYPE_CON0		0x1A8
+#define MT6331_INT_TYPE_CON0_SET	0x1AA
+#define MT6331_INT_TYPE_CON0_CLR	0x1AC
+#define MT6331_INT_TYPE_CON1		0x1AE
+#define MT6331_INT_TYPE_CON1_SET	0x1B0
+#define MT6331_INT_TYPE_CON1_CLR	0x1B2
+#define MT6331_INT_STA			0x1B4
+#define MT6331_BUCK_ALL_CON0		0x200
+#define MT6331_BUCK_ALL_CON1		0x202
+#define MT6331_BUCK_ALL_CON2		0x204
+#define MT6331_BUCK_ALL_CON3		0x206
+#define MT6331_BUCK_ALL_CON4		0x208
+#define MT6331_BUCK_ALL_CON5		0x20A
+#define MT6331_BUCK_ALL_CON6		0x20C
+#define MT6331_BUCK_ALL_CON7		0x20E
+#define MT6331_BUCK_ALL_CON8		0x210
+#define MT6331_BUCK_ALL_CON9		0x212
+#define MT6331_BUCK_ALL_CON10		0x214
+#define MT6331_BUCK_ALL_CON11		0x216
+#define MT6331_BUCK_ALL_CON12		0x218
+#define MT6331_BUCK_ALL_CON13		0x21A
+#define MT6331_BUCK_ALL_CON14		0x21C
+#define MT6331_BUCK_ALL_CON15		0x21E
+#define MT6331_BUCK_ALL_CON16		0x220
+#define MT6331_BUCK_ALL_CON17		0x222
+#define MT6331_BUCK_ALL_CON18		0x224
+#define MT6331_BUCK_ALL_CON19		0x226
+#define MT6331_BUCK_ALL_CON20		0x228
+#define MT6331_BUCK_ALL_CON21		0x22A
+#define MT6331_BUCK_ALL_CON22		0x22C
+#define MT6331_BUCK_ALL_CON23		0x22E
+#define MT6331_BUCK_ALL_CON24		0x230
+#define MT6331_BUCK_ALL_CON25		0x232
+#define MT6331_BUCK_ALL_CON26		0x234
+#define MT6331_VDVFS11_CON0		0x236
+#define MT6331_VDVFS11_CON1		0x238
+#define MT6331_VDVFS11_CON2		0x23A
+#define MT6331_VDVFS11_CON3		0x23C
+#define MT6331_VDVFS11_CON4		0x23E
+#define MT6331_VDVFS11_CON5		0x240
+#define MT6331_VDVFS11_CON6		0x242
+#define MT6331_VDVFS11_CON7		0x244
+#define MT6331_VDVFS11_CON8		0x246
+#define MT6331_VDVFS11_CON9		0x248
+#define MT6331_VDVFS11_CON10		0x24A
+#define MT6331_VDVFS11_CON11		0x24C
+#define MT6331_VDVFS11_CON12		0x24E
+#define MT6331_VDVFS11_CON13		0x250
+#define MT6331_VDVFS11_CON14		0x252
+#define MT6331_VDVFS11_CON18		0x25A
+#define MT6331_VDVFS11_CON19		0x25C
+#define MT6331_VDVFS11_CON20		0x25E
+#define MT6331_VDVFS11_CON21		0x260
+#define MT6331_VDVFS11_CON22		0x262
+#define MT6331_VDVFS11_CON23		0x264
+#define MT6331_VDVFS11_CON24		0x266
+#define MT6331_VDVFS11_CON25		0x268
+#define MT6331_VDVFS11_CON26		0x26A
+#define MT6331_VDVFS11_CON27		0x26C
+#define MT6331_VDVFS12_CON0		0x26E
+#define MT6331_VDVFS12_CON1		0x270
+#define MT6331_VDVFS12_CON2		0x272
+#define MT6331_VDVFS12_CON3		0x274
+#define MT6331_VDVFS12_CON4		0x276
+#define MT6331_VDVFS12_CON5		0x278
+#define MT6331_VDVFS12_CON6		0x27A
+#define MT6331_VDVFS12_CON7		0x27C
+#define MT6331_VDVFS12_CON8		0x27E
+#define MT6331_VDVFS12_CON9		0x280
+#define MT6331_VDVFS12_CON10		0x282
+#define MT6331_VDVFS12_CON11		0x284
+#define MT6331_VDVFS12_CON12		0x286
+#define MT6331_VDVFS12_CON13		0x288
+#define MT6331_VDVFS12_CON14		0x28A
+#define MT6331_VDVFS12_CON18		0x292
+#define MT6331_VDVFS12_CON19		0x294
+#define MT6331_VDVFS12_CON20		0x296
+#define MT6331_VDVFS13_CON0		0x298
+#define MT6331_VDVFS13_CON1		0x29A
+#define MT6331_VDVFS13_CON2		0x29C
+#define MT6331_VDVFS13_CON3		0x29E
+#define MT6331_VDVFS13_CON4		0x2A0
+#define MT6331_VDVFS13_CON5		0x2A2
+#define MT6331_VDVFS13_CON6		0x2A4
+#define MT6331_VDVFS13_CON7		0x2A6
+#define MT6331_VDVFS13_CON8		0x2A8
+#define MT6331_VDVFS13_CON9		0x2AA
+#define MT6331_VDVFS13_CON10		0x2AC
+#define MT6331_VDVFS13_CON11		0x2AE
+#define MT6331_VDVFS13_CON12		0x2B0
+#define MT6331_VDVFS13_CON13		0x2B2
+#define MT6331_VDVFS13_CON14		0x2B4
+#define MT6331_VDVFS13_CON18		0x2BC
+#define MT6331_VDVFS13_CON19		0x2BE
+#define MT6331_VDVFS13_CON20		0x2C0
+#define MT6331_VDVFS14_CON0		0x2C2
+#define MT6331_VDVFS14_CON1		0x2C4
+#define MT6331_VDVFS14_CON2		0x2C6
+#define MT6331_VDVFS14_CON3		0x2C8
+#define MT6331_VDVFS14_CON4		0x2CA
+#define MT6331_VDVFS14_CON5		0x2CC
+#define MT6331_VDVFS14_CON6		0x2CE
+#define MT6331_VDVFS14_CON7		0x2D0
+#define MT6331_VDVFS14_CON8		0x2D2
+#define MT6331_VDVFS14_CON9		0x2D4
+#define MT6331_VDVFS14_CON10		0x2D6
+#define MT6331_VDVFS14_CON11		0x2D8
+#define MT6331_VDVFS14_CON12		0x2DA
+#define MT6331_VDVFS14_CON13		0x2DC
+#define MT6331_VDVFS14_CON14		0x2DE
+#define MT6331_VDVFS14_CON18		0x2E6
+#define MT6331_VDVFS14_CON19		0x2E8
+#define MT6331_VDVFS14_CON20		0x2EA
+#define MT6331_VGPU_CON0		0x300
+#define MT6331_VGPU_CON1		0x302
+#define MT6331_VGPU_CON2		0x304
+#define MT6331_VGPU_CON3		0x306
+#define MT6331_VGPU_CON4		0x308
+#define MT6331_VGPU_CON5		0x30A
+#define MT6331_VGPU_CON6		0x30C
+#define MT6331_VGPU_CON7		0x30E
+#define MT6331_VGPU_CON8		0x310
+#define MT6331_VGPU_CON9		0x312
+#define MT6331_VGPU_CON10		0x314
+#define MT6331_VGPU_CON11		0x316
+#define MT6331_VGPU_CON12		0x318
+#define MT6331_VGPU_CON13		0x31A
+#define MT6331_VGPU_CON14		0x31C
+#define MT6331_VGPU_CON15		0x31E
+#define MT6331_VGPU_CON16		0x320
+#define MT6331_VGPU_CON17		0x322
+#define MT6331_VGPU_CON18		0x324
+#define MT6331_VGPU_CON19		0x326
+#define MT6331_VGPU_CON20		0x328
+#define MT6331_VCORE1_CON0		0x32A
+#define MT6331_VCORE1_CON1		0x32C
+#define MT6331_VCORE1_CON2		0x32E
+#define MT6331_VCORE1_CON3		0x330
+#define MT6331_VCORE1_CON4		0x332
+#define MT6331_VCORE1_CON5		0x334
+#define MT6331_VCORE1_CON6		0x336
+#define MT6331_VCORE1_CON7		0x338
+#define MT6331_VCORE1_CON8		0x33A
+#define MT6331_VCORE1_CON9		0x33C
+#define MT6331_VCORE1_CON10		0x33E
+#define MT6331_VCORE1_CON11		0x340
+#define MT6331_VCORE1_CON12		0x342
+#define MT6331_VCORE1_CON13		0x344
+#define MT6331_VCORE1_CON14		0x346
+#define MT6331_VCORE1_CON15		0x348
+#define MT6331_VCORE1_CON16		0x34A
+#define MT6331_VCORE1_CON17		0x34C
+#define MT6331_VCORE1_CON18		0x34E
+#define MT6331_VCORE1_CON19		0x350
+#define MT6331_VCORE1_CON20		0x352
+#define MT6331_VCORE2_CON0		0x354
+#define MT6331_VCORE2_CON1		0x356
+#define MT6331_VCORE2_CON2		0x358
+#define MT6331_VCORE2_CON3		0x35A
+#define MT6331_VCORE2_CON4		0x35C
+#define MT6331_VCORE2_CON5		0x35E
+#define MT6331_VCORE2_CON6		0x360
+#define MT6331_VCORE2_CON7		0x362
+#define MT6331_VCORE2_CON8		0x364
+#define MT6331_VCORE2_CON9		0x366
+#define MT6331_VCORE2_CON10		0x368
+#define MT6331_VCORE2_CON11		0x36A
+#define MT6331_VCORE2_CON12		0x36C
+#define MT6331_VCORE2_CON13		0x36E
+#define MT6331_VCORE2_CON14		0x370
+#define MT6331_VCORE2_CON15		0x372
+#define MT6331_VCORE2_CON16		0x374
+#define MT6331_VCORE2_CON17		0x376
+#define MT6331_VCORE2_CON18		0x378
+#define MT6331_VCORE2_CON19		0x37A
+#define MT6331_VCORE2_CON20		0x37C
+#define MT6331_VCORE2_CON21		0x37E
+#define MT6331_VIO18_CON0		0x380
+#define MT6331_VIO18_CON1		0x382
+#define MT6331_VIO18_CON2		0x384
+#define MT6331_VIO18_CON3		0x386
+#define MT6331_VIO18_CON4		0x388
+#define MT6331_VIO18_CON5		0x38A
+#define MT6331_VIO18_CON6		0x38C
+#define MT6331_VIO18_CON7		0x38E
+#define MT6331_VIO18_CON8		0x390
+#define MT6331_VIO18_CON9		0x392
+#define MT6331_VIO18_CON10		0x394
+#define MT6331_VIO18_CON11		0x396
+#define MT6331_VIO18_CON12		0x398
+#define MT6331_VIO18_CON13		0x39A
+#define MT6331_VIO18_CON14		0x39C
+#define MT6331_VIO18_CON15		0x39E
+#define MT6331_VIO18_CON16		0x3A0
+#define MT6331_VIO18_CON17		0x3A2
+#define MT6331_VIO18_CON18		0x3A4
+#define MT6331_VIO18_CON19		0x3A6
+#define MT6331_VIO18_CON20		0x3A8
+#define MT6331_BUCK_K_CON0		0x3AA
+#define MT6331_BUCK_K_CON1		0x3AC
+#define MT6331_BUCK_K_CON2		0x3AE
+#define MT6331_BUCK_K_CON3		0x3B0
+#define MT6331_ZCD_CON0			0x400
+#define MT6331_ZCD_CON1			0x402
+#define MT6331_ZCD_CON2			0x404
+#define MT6331_ZCD_CON3			0x406
+#define MT6331_ZCD_CON4			0x408
+#define MT6331_ZCD_CON5			0x40A
+#define MT6331_ISINK0_CON0		0x40C
+#define MT6331_ISINK0_CON1		0x40E
+#define MT6331_ISINK0_CON2		0x410
+#define MT6331_ISINK0_CON3		0x412
+#define MT6331_ISINK0_CON4		0x414
+#define MT6331_ISINK1_CON0		0x416
+#define MT6331_ISINK1_CON1		0x418
+#define MT6331_ISINK1_CON2		0x41A
+#define MT6331_ISINK1_CON3		0x41C
+#define MT6331_ISINK1_CON4		0x41E
+#define MT6331_ISINK2_CON0		0x420
+#define MT6331_ISINK2_CON1		0x422
+#define MT6331_ISINK2_CON2		0x424
+#define MT6331_ISINK2_CON3		0x426
+#define MT6331_ISINK2_CON4		0x428
+#define MT6331_ISINK3_CON0		0x42A
+#define MT6331_ISINK3_CON1		0x42C
+#define MT6331_ISINK3_CON2		0x42E
+#define MT6331_ISINK3_CON3		0x430
+#define MT6331_ISINK3_CON4		0x432
+#define MT6331_ISINK_ANA0		0x434
+#define MT6331_ISINK_ANA1		0x436
+#define MT6331_ISINK_PHASE_DLY		0x438
+#define MT6331_ISINK_EN_CTRL		0x43A
+#define MT6331_ANALDO_CON0		0x500
+#define MT6331_ANALDO_CON1		0x502
+#define MT6331_ANALDO_CON2		0x504
+#define MT6331_ANALDO_CON3		0x506
+#define MT6331_ANALDO_CON4		0x508
+#define MT6331_ANALDO_CON5		0x50A
+#define MT6331_ANALDO_CON6		0x50C
+#define MT6331_ANALDO_CON7		0x50E
+#define MT6331_ANALDO_CON8		0x510
+#define MT6331_ANALDO_CON9		0x512
+#define MT6331_ANALDO_CON10		0x514
+#define MT6331_ANALDO_CON11		0x516
+#define MT6331_ANALDO_CON12		0x518
+#define MT6331_ANALDO_CON13		0x51A
+#define MT6331_SYSLDO_CON0		0x51C
+#define MT6331_SYSLDO_CON1		0x51E
+#define MT6331_SYSLDO_CON2		0x520
+#define MT6331_SYSLDO_CON3		0x522
+#define MT6331_SYSLDO_CON4		0x524
+#define MT6331_SYSLDO_CON5		0x526
+#define MT6331_SYSLDO_CON6		0x528
+#define MT6331_SYSLDO_CON7		0x52A
+#define MT6331_SYSLDO_CON8		0x52C
+#define MT6331_SYSLDO_CON9		0x52E
+#define MT6331_SYSLDO_CON10		0x530
+#define MT6331_SYSLDO_CON11		0x532
+#define MT6331_SYSLDO_CON12		0x534
+#define MT6331_SYSLDO_CON13		0x536
+#define MT6331_SYSLDO_CON14		0x538
+#define MT6331_SYSLDO_CON15		0x53A
+#define MT6331_SYSLDO_CON16		0x53C
+#define MT6331_SYSLDO_CON17		0x53E
+#define MT6331_SYSLDO_CON18		0x540
+#define MT6331_SYSLDO_CON19		0x542
+#define MT6331_SYSLDO_CON20		0x544
+#define MT6331_SYSLDO_CON21		0x546
+#define MT6331_DIGLDO_CON0		0x548
+#define MT6331_DIGLDO_CON1		0x54A
+#define MT6331_DIGLDO_CON2		0x54C
+#define MT6331_DIGLDO_CON3		0x54E
+#define MT6331_DIGLDO_CON4		0x550
+#define MT6331_DIGLDO_CON5		0x552
+#define MT6331_DIGLDO_CON6		0x554
+#define MT6331_DIGLDO_CON7		0x556
+#define MT6331_DIGLDO_CON8		0x558
+#define MT6331_DIGLDO_CON9		0x55A
+#define MT6331_DIGLDO_CON10		0x55C
+#define MT6331_DIGLDO_CON11		0x55E
+#define MT6331_DIGLDO_CON12		0x560
+#define MT6331_DIGLDO_CON13		0x562
+#define MT6331_DIGLDO_CON14		0x564
+#define MT6331_DIGLDO_CON15		0x566
+#define MT6331_DIGLDO_CON16		0x568
+#define MT6331_DIGLDO_CON17		0x56A
+#define MT6331_DIGLDO_CON18		0x56C
+#define MT6331_DIGLDO_CON19		0x56E
+#define MT6331_DIGLDO_CON20		0x570
+#define MT6331_DIGLDO_CON21		0x572
+#define MT6331_DIGLDO_CON22		0x574
+#define MT6331_DIGLDO_CON23		0x576
+#define MT6331_DIGLDO_CON24		0x578
+#define MT6331_DIGLDO_CON25		0x57A
+#define MT6331_DIGLDO_CON26		0x57C
+#define MT6331_DIGLDO_CON27		0x57E
+#define MT6331_DIGLDO_CON28		0x580
+#define MT6331_OTP_CON0			0x600
+#define MT6331_OTP_CON1			0x602
+#define MT6331_OTP_CON2			0x604
+#define MT6331_OTP_CON3			0x606
+#define MT6331_OTP_CON4			0x608
+#define MT6331_OTP_CON5			0x60A
+#define MT6331_OTP_CON6			0x60C
+#define MT6331_OTP_CON7			0x60E
+#define MT6331_OTP_CON8			0x610
+#define MT6331_OTP_CON9			0x612
+#define MT6331_OTP_CON10		0x614
+#define MT6331_OTP_CON11		0x616
+#define MT6331_OTP_CON12		0x618
+#define MT6331_OTP_CON13		0x61A
+#define MT6331_OTP_CON14		0x61C
+#define MT6331_OTP_DOUT_0_15		0x61E
+#define MT6331_OTP_DOUT_16_31		0x620
+#define MT6331_OTP_DOUT_32_47		0x622
+#define MT6331_OTP_DOUT_48_63		0x624
+#define MT6331_OTP_DOUT_64_79		0x626
+#define MT6331_OTP_DOUT_80_95		0x628
+#define MT6331_OTP_DOUT_96_111		0x62A
+#define MT6331_OTP_DOUT_112_127		0x62C
+#define MT6331_OTP_DOUT_128_143		0x62E
+#define MT6331_OTP_DOUT_144_159		0x630
+#define MT6331_OTP_DOUT_160_175		0x632
+#define MT6331_OTP_DOUT_176_191		0x634
+#define MT6331_OTP_DOUT_192_207		0x636
+#define MT6331_OTP_DOUT_208_223		0x638
+#define MT6331_OTP_DOUT_224_239		0x63A
+#define MT6331_OTP_DOUT_240_255		0x63C
+#define MT6331_OTP_VAL_0_15		0x63E
+#define MT6331_OTP_VAL_16_31		0x640
+#define MT6331_OTP_VAL_32_47		0x642
+#define MT6331_OTP_VAL_48_63		0x644
+#define MT6331_OTP_VAL_64_79		0x646
+#define MT6331_OTP_VAL_80_95		0x648
+#define MT6331_OTP_VAL_96_111		0x64A
+#define MT6331_OTP_VAL_112_127		0x64C
+#define MT6331_OTP_VAL_128_143		0x64E
+#define MT6331_OTP_VAL_144_159		0x650
+#define MT6331_OTP_VAL_160_175		0x652
+#define MT6331_OTP_VAL_176_191		0x654
+#define MT6331_OTP_VAL_192_207		0x656
+#define MT6331_OTP_VAL_208_223		0x658
+#define MT6331_OTP_VAL_224_239		0x65A
+#define MT6331_OTP_VAL_240_255		0x65C
+#define MT6331_RTC_MIX_CON0		0x65E
+#define MT6331_RTC_MIX_CON1		0x660
+#define MT6331_AUDDAC_CFG0		0x662
+#define MT6331_AUDBUF_CFG0		0x664
+#define MT6331_AUDBUF_CFG1		0x666
+#define MT6331_AUDBUF_CFG2		0x668
+#define MT6331_AUDBUF_CFG3		0x66A
+#define MT6331_AUDBUF_CFG4		0x66C
+#define MT6331_AUDBUF_CFG5		0x66E
+#define MT6331_AUDBUF_CFG6		0x670
+#define MT6331_AUDBUF_CFG7		0x672
+#define MT6331_AUDBUF_CFG8		0x674
+#define MT6331_IBIASDIST_CFG0		0x676
+#define MT6331_AUDCLKGEN_CFG0		0x678
+#define MT6331_AUDLDO_CFG0		0x67A
+#define MT6331_AUDDCDC_CFG0		0x67C
+#define MT6331_AUDDCDC_CFG1		0x67E
+#define MT6331_AUDNVREGGLB_CFG0		0x680
+#define MT6331_AUD_NCP0			0x682
+#define MT6331_AUD_ZCD_CFG0		0x684
+#define MT6331_AUDPREAMP_CFG0		0x686
+#define MT6331_AUDPREAMP_CFG1		0x688
+#define MT6331_AUDPREAMP_CFG2		0x68A
+#define MT6331_AUDADC_CFG0		0x68C
+#define MT6331_AUDADC_CFG1		0x68E
+#define MT6331_AUDADC_CFG2		0x690
+#define MT6331_AUDADC_CFG3		0x692
+#define MT6331_AUDADC_CFG4		0x694
+#define MT6331_AUDADC_CFG5		0x696
+#define MT6331_AUDDIGMI_CFG0		0x698
+#define MT6331_AUDDIGMI_CFG1		0x69A
+#define MT6331_AUDMICBIAS_CFG0		0x69C
+#define MT6331_AUDMICBIAS_CFG1		0x69E
+#define MT6331_AUDENCSPARE_CFG0		0x6A0
+#define MT6331_AUDPREAMPGAIN_CFG0	0x6A2
+#define MT6331_AUDMADPLL_CFG0		0x6A4
+#define MT6331_AUDMADPLL_CFG1		0x6A6
+#define MT6331_AUDMADPLL_CFG2		0x6A8
+#define MT6331_AUDLDO_NVREG_CFG0	0x6AA
+#define MT6331_AUDLDO_NVREG_CFG1	0x6AC
+#define MT6331_AUDLDO_NVREG_CFG2	0x6AE
+#define MT6331_AUXADC_ADC0		0x700
+#define MT6331_AUXADC_ADC1		0x702
+#define MT6331_AUXADC_ADC2		0x704
+#define MT6331_AUXADC_ADC3		0x706
+#define MT6331_AUXADC_ADC4		0x708
+#define MT6331_AUXADC_ADC5		0x70A
+#define MT6331_AUXADC_ADC6		0x70C
+#define MT6331_AUXADC_ADC7		0x70E
+#define MT6331_AUXADC_ADC8		0x710
+#define MT6331_AUXADC_ADC9		0x712
+#define MT6331_AUXADC_ADC10		0x714
+#define MT6331_AUXADC_ADC11		0x716
+#define MT6331_AUXADC_ADC12		0x718
+#define MT6331_AUXADC_ADC13		0x71A
+#define MT6331_AUXADC_ADC14		0x71C
+#define MT6331_AUXADC_ADC15		0x71E
+#define MT6331_AUXADC_ADC16		0x720
+#define MT6331_AUXADC_ADC17		0x722
+#define MT6331_AUXADC_ADC18		0x724
+#define MT6331_AUXADC_ADC19		0x726
+#define MT6331_AUXADC_STA0		0x728
+#define MT6331_AUXADC_STA1		0x72A
+#define MT6331_AUXADC_RQST0		0x72C
+#define MT6331_AUXADC_RQST0_SET		0x72E
+#define MT6331_AUXADC_RQST0_CLR		0x730
+#define MT6331_AUXADC_RQST1		0x732
+#define MT6331_AUXADC_RQST1_SET		0x734
+#define MT6331_AUXADC_RQST1_CLR		0x736
+#define MT6331_AUXADC_CON0		0x738
+#define MT6331_AUXADC_CON1		0x73A
+#define MT6331_AUXADC_CON2		0x73C
+#define MT6331_AUXADC_CON3		0x73E
+#define MT6331_AUXADC_CON4		0x740
+#define MT6331_AUXADC_CON5		0x742
+#define MT6331_AUXADC_CON6		0x744
+#define MT6331_AUXADC_CON7		0x746
+#define MT6331_AUXADC_CON8		0x748
+#define MT6331_AUXADC_CON9		0x74A
+#define MT6331_AUXADC_CON10		0x74C
+#define MT6331_AUXADC_CON11		0x74E
+#define MT6331_AUXADC_CON12		0x750
+#define MT6331_AUXADC_CON13		0x752
+#define MT6331_AUXADC_CON14		0x754
+#define MT6331_AUXADC_CON15		0x756
+#define MT6331_AUXADC_CON16		0x758
+#define MT6331_AUXADC_CON17		0x75A
+#define MT6331_AUXADC_CON18		0x75C
+#define MT6331_AUXADC_CON19		0x75E
+#define MT6331_AUXADC_CON20		0x760
+#define MT6331_AUXADC_CON21		0x762
+#define MT6331_AUXADC_CON22		0x764
+#define MT6331_AUXADC_CON23		0x766
+#define MT6331_AUXADC_CON24		0x768
+#define MT6331_AUXADC_CON25		0x76A
+#define MT6331_AUXADC_CON26		0x76C
+#define MT6331_AUXADC_CON27		0x76E
+#define MT6331_AUXADC_CON28		0x770
+#define MT6331_AUXADC_CON29		0x772
+#define MT6331_AUXADC_CON30		0x774
+#define MT6331_AUXADC_CON31		0x776
+#define MT6331_AUXADC_CON32		0x778
+#define MT6331_ACCDET_CON0		0x77A
+#define MT6331_ACCDET_CON1		0x77C
+#define MT6331_ACCDET_CON2		0x77E
+#define MT6331_ACCDET_CON3		0x780
+#define MT6331_ACCDET_CON4		0x782
+#define MT6331_ACCDET_CON5		0x784
+#define MT6331_ACCDET_CON6		0x786
+#define MT6331_ACCDET_CON7		0x788
+#define MT6331_ACCDET_CON8		0x78A
+#define MT6331_ACCDET_CON9		0x78C
+#define MT6331_ACCDET_CON10		0x78E
+#define MT6331_ACCDET_CON11		0x790
+#define MT6331_ACCDET_CON12		0x792
+#define MT6331_ACCDET_CON13		0x794
+#define MT6331_ACCDET_CON14		0x796
+#define MT6331_ACCDET_CON15		0x798
+#define MT6331_ACCDET_CON16		0x79A
+#define MT6331_ACCDET_CON17		0x79C
+#define MT6331_ACCDET_CON18		0x79E
+#define MT6331_ACCDET_CON19		0x7A0
+#define MT6331_ACCDET_CON20		0x7A2
+#define MT6331_ACCDET_CON21		0x7A4
+#define MT6331_ACCDET_CON22		0x7A6
+#define MT6331_ACCDET_CON23		0x7A8
+#define MT6331_ACCDET_CON24		0x7AA
+
+#endif /* __MFD_MT6331_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6332/core.h b/include/linux/mfd/mt6332/core.h
new file mode 100644
index 000000000000..cd6013eb82d9
--- /dev/null
+++ b/include/linux/mfd/mt6332/core.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6332_CORE_H__
+#define __MFD_MT6332_CORE_H__
+
+enum mt6332_irq_status_numbers {
+	MT6332_IRQ_STATUS_CHR_COMPLETE = 0,
+	MT6332_IRQ_STATUS_THERMAL_SD,
+	MT6332_IRQ_STATUS_THERMAL_REG_IN,
+	MT6332_IRQ_STATUS_THERMAL_REG_OUT,
+	MT6332_IRQ_STATUS_OTG_OC,
+	MT6332_IRQ_STATUS_CHR_OC,
+	MT6332_IRQ_STATUS_OTG_THERMAL,
+	MT6332_IRQ_STATUS_CHRIN_SHORT,
+	MT6332_IRQ_STATUS_DRVCDT_SHORT,
+	MT6332_IRQ_STATUS_PLUG_IN_FLASH,
+	MT6332_IRQ_STATUS_CHRWDT_FLAG,
+	MT6332_IRQ_STATUS_FLASH_EN_TIMEOUT,
+	MT6332_IRQ_STATUS_FLASH_VLED1_SHORT,
+	MT6332_IRQ_STATUS_FLASH_VLED1_OPEN = 13,
+	MT6332_IRQ_STATUS_OV = 16,
+	MT6332_IRQ_STATUS_BVALID_DET,
+	MT6332_IRQ_STATUS_VBATON_UNDET,
+	MT6332_IRQ_STATUS_CHR_PLUG_IN,
+	MT6332_IRQ_STATUS_CHR_PLUG_OUT,
+	MT6332_IRQ_STATUS_BC11_TIMEOUT,
+	MT6332_IRQ_STATUS_FLASH_VLED2_SHORT,
+	MT6332_IRQ_STATUS_FLASH_VLED2_OPEN = 23,
+	MT6332_IRQ_STATUS_THR_H = 32,
+	MT6332_IRQ_STATUS_THR_L,
+	MT6332_IRQ_STATUS_BAT_H,
+	MT6332_IRQ_STATUS_BAT_L,
+	MT6332_IRQ_STATUS_M3_H,
+	MT6332_IRQ_STATUS_M3_L,
+	MT6332_IRQ_STATUS_FG_BAT_H,
+	MT6332_IRQ_STATUS_FG_BAT_L,
+	MT6332_IRQ_STATUS_FG_CUR_H,
+	MT6332_IRQ_STATUS_FG_CUR_L,
+	MT6332_IRQ_STATUS_SPKL_D,
+	MT6332_IRQ_STATUS_SPKL_AB,
+	MT6332_IRQ_STATUS_BIF,
+	MT6332_IRQ_STATUS_VWLED_OC = 45,
+	MT6332_IRQ_STATUS_VDRAM_OC = 48,
+	MT6332_IRQ_STATUS_VDVFS2_OC,
+	MT6332_IRQ_STATUS_VRF1_OC,
+	MT6332_IRQ_STATUS_VRF2_OC,
+	MT6332_IRQ_STATUS_VPA_OC,
+	MT6332_IRQ_STATUS_VSBST_OC,
+	MT6332_IRQ_STATUS_LDO_OC,
+	MT6332_IRQ_STATUS_NR,
+};
+
+#define MT6332_IRQ_CON0_BASE	MT6332_IRQ_STATUS_CHR_COMPLETE
+#define MT6332_IRQ_CON0_BITS	(MT6332_IRQ_STATUS_FLASH_VLED1_OPEN + 1)
+#define MT6332_IRQ_CON1_BASE	MT6332_IRQ_STATUS_OV
+#define MT6332_IRQ_CON1_BITS	(MT6332_IRQ_STATUS_FLASH_VLED2_OPEN - MT6332_IRQ_STATUS_OV + 1)
+#define MT6332_IRQ_CON2_BASE	MT6332_IRQ_STATUS_THR_H
+#define MT6332_IRQ_CON2_BITS	(MT6332_IRQ_STATUS_VWLED_OC - MT6332_IRQ_STATUS_THR_H + 1)
+#define MT6332_IRQ_CON3_BASE	MT6332_IRQ_STATUS_VDRAM_OC
+#define MT6332_IRQ_CON3_BITS	(MT6332_IRQ_STATUS_LDO_OC - MT6332_IRQ_STATUS_VDRAM_OC + 1)
+
+#endif /* __MFD_MT6332_CORE_H__ */
diff --git a/include/linux/mfd/mt6332/registers.h b/include/linux/mfd/mt6332/registers.h
new file mode 100644
index 000000000000..65e0b86fceac
--- /dev/null
+++ b/include/linux/mfd/mt6332/registers.h
@@ -0,0 +1,642 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2022 AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ */
+
+#ifndef __MFD_MT6332_REGISTERS_H__
+#define __MFD_MT6332_REGISTERS_H__
+
+/* PMIC Registers */
+#define MT6332_HWCID              0x8000
+#define MT6332_SWCID              0x8002
+#define MT6332_TOP_CON            0x8004
+#define MT6332_DDR_VREF_AP_CON    0x8006
+#define MT6332_DDR_VREF_DQ_CON    0x8008
+#define MT6332_DDR_VREF_CA_CON    0x800A
+#define MT6332_TEST_OUT           0x800C
+#define MT6332_TEST_CON0          0x800E
+#define MT6332_TEST_CON1          0x8010
+#define MT6332_TESTMODE_SW        0x8012
+#define MT6332_TESTMODE_ANA       0x8014
+#define MT6332_TDSEL_CON          0x8016
+#define MT6332_RDSEL_CON          0x8018
+#define MT6332_SMT_CON0           0x801A
+#define MT6332_SMT_CON1           0x801C
+#define MT6332_DRV_CON0           0x801E
+#define MT6332_DRV_CON1           0x8020
+#define MT6332_DRV_CON2           0x8022
+#define MT6332_EN_STATUS0         0x8024
+#define MT6332_OCSTATUS0          0x8026
+#define MT6332_TOP_STATUS         0x8028
+#define MT6332_TOP_STATUS_SET     0x802A
+#define MT6332_TOP_STATUS_CLR     0x802C
+#define MT6332_FLASH_CON0         0x802E
+#define MT6332_FLASH_CON1         0x8030
+#define MT6332_FLASH_CON2         0x8032
+#define MT6332_CORE_CON0          0x8034
+#define MT6332_CORE_CON1          0x8036
+#define MT6332_CORE_CON2          0x8038
+#define MT6332_CORE_CON3          0x803A
+#define MT6332_CORE_CON4          0x803C
+#define MT6332_CORE_CON5          0x803E
+#define MT6332_CORE_CON6          0x8040
+#define MT6332_CORE_CON7          0x8042
+#define MT6332_CORE_CON8          0x8044
+#define MT6332_CORE_CON9          0x8046
+#define MT6332_CORE_CON10         0x8048
+#define MT6332_CORE_CON11         0x804A
+#define MT6332_CORE_CON12         0x804C
+#define MT6332_CORE_CON13         0x804E
+#define MT6332_CORE_CON14         0x8050
+#define MT6332_CORE_CON15         0x8052
+#define MT6332_STA_CON0           0x8054
+#define MT6332_STA_CON1           0x8056
+#define MT6332_STA_CON2           0x8058
+#define MT6332_STA_CON3           0x805A
+#define MT6332_STA_CON4           0x805C
+#define MT6332_STA_CON5           0x805E
+#define MT6332_STA_CON6           0x8060
+#define MT6332_STA_CON7           0x8062
+#define MT6332_CHR_CON0           0x8064
+#define MT6332_CHR_CON1           0x8066
+#define MT6332_CHR_CON2           0x8068
+#define MT6332_CHR_CON3           0x806A
+#define MT6332_CHR_CON4           0x806C
+#define MT6332_CHR_CON5           0x806E
+#define MT6332_CHR_CON6           0x8070
+#define MT6332_CHR_CON7           0x8072
+#define MT6332_CHR_CON8           0x8074
+#define MT6332_CHR_CON9           0x8076
+#define MT6332_CHR_CON10          0x8078
+#define MT6332_CHR_CON11          0x807A
+#define MT6332_CHR_CON12          0x807C
+#define MT6332_CHR_CON13          0x807E
+#define MT6332_CHR_CON14          0x8080
+#define MT6332_CHR_CON15          0x8082
+#define MT6332_BOOST_CON0         0x8084
+#define MT6332_BOOST_CON1         0x8086
+#define MT6332_BOOST_CON2         0x8088
+#define MT6332_BOOST_CON3         0x808A
+#define MT6332_BOOST_CON4         0x808C
+#define MT6332_BOOST_CON5         0x808E
+#define MT6332_BOOST_CON6         0x8090
+#define MT6332_BOOST_CON7         0x8092
+#define MT6332_TOP_CKPDN_CON0     0x8094
+#define MT6332_TOP_CKPDN_CON0_SET 0x8096
+#define MT6332_TOP_CKPDN_CON0_CLR 0x8098
+#define MT6332_TOP_CKPDN_CON1     0x809A
+#define MT6332_TOP_CKPDN_CON1_SET 0x809C
+#define MT6332_TOP_CKPDN_CON1_CLR 0x809E
+#define MT6332_TOP_CKPDN_CON2     0x80A0
+#define MT6332_TOP_CKPDN_CON2_SET 0x80A2
+#define MT6332_TOP_CKPDN_CON2_CLR 0x80A4
+#define MT6332_TOP_CKSEL_CON0     0x80A6
+#define MT6332_TOP_CKSEL_CON0_SET 0x80A8
+#define MT6332_TOP_CKSEL_CON0_CLR 0x80AA
+#define MT6332_TOP_CKSEL_CON1     0x80AC
+#define MT6332_TOP_CKSEL_CON1_SET 0x80AE
+#define MT6332_TOP_CKSEL_CON1_CLR 0x80B0
+#define MT6332_TOP_CKHWEN_CON     0x80B2
+#define MT6332_TOP_CKHWEN_CON_SET 0x80B4
+#define MT6332_TOP_CKHWEN_CON_CLR 0x80B6
+#define MT6332_TOP_CKTST_CON0     0x80B8
+#define MT6332_TOP_CKTST_CON1     0x80BA
+#define MT6332_TOP_RST_CON        0x80BC
+#define MT6332_TOP_RST_CON_SET    0x80BE
+#define MT6332_TOP_RST_CON_CLR    0x80C0
+#define MT6332_TOP_RST_MISC       0x80C2
+#define MT6332_TOP_RST_MISC_SET   0x80C4
+#define MT6332_TOP_RST_MISC_CLR   0x80C6
+#define MT6332_INT_CON0           0x80C8
+#define MT6332_INT_CON0_SET       0x80CA
+#define MT6332_INT_CON0_CLR       0x80CC
+#define MT6332_INT_CON1           0x80CE
+#define MT6332_INT_CON1_SET       0x80D0
+#define MT6332_INT_CON1_CLR       0x80D2
+#define MT6332_INT_CON2           0x80D4
+#define MT6332_INT_CON2_SET       0x80D6
+#define MT6332_INT_CON2_CLR       0x80D8
+#define MT6332_INT_CON3           0x80DA
+#define MT6332_INT_CON3_SET       0x80DC
+#define MT6332_INT_CON3_CLR       0x80DE
+#define MT6332_CHRWDT_CON0        0x80E0
+#define MT6332_CHRWDT_STATUS0     0x80E2
+#define MT6332_INT_STATUS0        0x80E4
+#define MT6332_INT_STATUS1        0x80E6
+#define MT6332_INT_STATUS2        0x80E8
+#define MT6332_INT_STATUS3        0x80EA
+#define MT6332_OC_GEAR_0          0x80EC
+#define MT6332_OC_GEAR_1          0x80EE
+#define MT6332_OC_GEAR_2          0x80F0
+#define MT6332_INT_MISC_CON       0x80F2
+#define MT6332_RG_SPI_CON         0x80F4
+#define MT6332_DEW_DIO_EN         0x80F6
+#define MT6332_DEW_READ_TEST      0x80F8
+#define MT6332_DEW_WRITE_TEST     0x80FA
+#define MT6332_DEW_CRC_SWRST      0x80FC
+#define MT6332_DEW_CRC_EN         0x80FE
+#define MT6332_DEW_CRC_VAL        0x8100
+#define MT6332_DEW_DBG_MON_SEL    0x8102
+#define MT6332_DEW_CIPHER_KEY_SEL 0x8104
+#define MT6332_DEW_CIPHER_IV_SEL  0x8106
+#define MT6332_DEW_CIPHER_EN      0x8108
+#define MT6332_DEW_CIPHER_RDY     0x810A
+#define MT6332_DEW_CIPHER_MODE    0x810C
+#define MT6332_DEW_CIPHER_SWRST   0x810E
+#define MT6332_DEW_RDDMY_NO       0x8110
+#define MT6332_INT_STA            0x8112
+#define MT6332_BIF_CON0           0x8114
+#define MT6332_BIF_CON1           0x8116
+#define MT6332_BIF_CON2           0x8118
+#define MT6332_BIF_CON3           0x811A
+#define MT6332_BIF_CON4           0x811C
+#define MT6332_BIF_CON5           0x811E
+#define MT6332_BIF_CON6           0x8120
+#define MT6332_BIF_CON7           0x8122
+#define MT6332_BIF_CON8           0x8124
+#define MT6332_BIF_CON9           0x8126
+#define MT6332_BIF_CON10          0x8128
+#define MT6332_BIF_CON11          0x812A
+#define MT6332_BIF_CON12          0x812C
+#define MT6332_BIF_CON13          0x812E
+#define MT6332_BIF_CON14          0x8130
+#define MT6332_BIF_CON15          0x8132
+#define MT6332_BIF_CON16          0x8134
+#define MT6332_BIF_CON17          0x8136
+#define MT6332_BIF_CON18          0x8138
+#define MT6332_BIF_CON19          0x813A
+#define MT6332_BIF_CON20          0x813C
+#define MT6332_BIF_CON21          0x813E
+#define MT6332_BIF_CON22          0x8140
+#define MT6332_BIF_CON23          0x8142
+#define MT6332_BIF_CON24          0x8144
+#define MT6332_BIF_CON25          0x8146
+#define MT6332_BIF_CON26          0x8148
+#define MT6332_BIF_CON27          0x814A
+#define MT6332_BIF_CON28          0x814C
+#define MT6332_BIF_CON29          0x814E
+#define MT6332_BIF_CON30          0x8150
+#define MT6332_BIF_CON31          0x8152
+#define MT6332_BIF_CON32          0x8154
+#define MT6332_BIF_CON33          0x8156
+#define MT6332_BIF_CON34          0x8158
+#define MT6332_BIF_CON35          0x815A
+#define MT6332_BIF_CON36          0x815C
+#define MT6332_BATON_CON0         0x815E
+#define MT6332_BIF_CON37          0x8160
+#define MT6332_BIF_CON38          0x8162
+#define MT6332_CHR_CON16          0x8164
+#define MT6332_CHR_CON17          0x8166
+#define MT6332_CHR_CON18          0x8168
+#define MT6332_CHR_CON19          0x816A
+#define MT6332_CHR_CON20          0x816C
+#define MT6332_CHR_CON21          0x816E
+#define MT6332_CHR_CON22          0x8170
+#define MT6332_CHR_CON23          0x8172
+#define MT6332_CHR_CON24          0x8174
+#define MT6332_CHR_CON25          0x8176
+#define MT6332_STA_CON8           0x8178
+#define MT6332_BUCK_ALL_CON0      0x8400
+#define MT6332_BUCK_ALL_CON1      0x8402
+#define MT6332_BUCK_ALL_CON2      0x8404
+#define MT6332_BUCK_ALL_CON3      0x8406
+#define MT6332_BUCK_ALL_CON4      0x8408
+#define MT6332_BUCK_ALL_CON5      0x840A
+#define MT6332_BUCK_ALL_CON6      0x840C
+#define MT6332_BUCK_ALL_CON7      0x840E
+#define MT6332_BUCK_ALL_CON8      0x8410
+#define MT6332_BUCK_ALL_CON9      0x8412
+#define MT6332_BUCK_ALL_CON10     0x8414
+#define MT6332_BUCK_ALL_CON11     0x8416
+#define MT6332_BUCK_ALL_CON12     0x8418
+#define MT6332_BUCK_ALL_CON13     0x841A
+#define MT6332_BUCK_ALL_CON14     0x841C
+#define MT6332_BUCK_ALL_CON15     0x841E
+#define MT6332_BUCK_ALL_CON16     0x8420
+#define MT6332_BUCK_ALL_CON17     0x8422
+#define MT6332_BUCK_ALL_CON18     0x8424
+#define MT6332_BUCK_ALL_CON19     0x8426
+#define MT6332_BUCK_ALL_CON20     0x8428
+#define MT6332_BUCK_ALL_CON21     0x842A
+#define MT6332_BUCK_ALL_CON22     0x842C
+#define MT6332_BUCK_ALL_CON23     0x842E
+#define MT6332_BUCK_ALL_CON24     0x8430
+#define MT6332_BUCK_ALL_CON25     0x8432
+#define MT6332_BUCK_ALL_CON26     0x8434
+#define MT6332_BUCK_ALL_CON27     0x8436
+#define MT6332_VDRAM_CON0         0x8438
+#define MT6332_VDRAM_CON1         0x843A
+#define MT6332_VDRAM_CON2         0x843C
+#define MT6332_VDRAM_CON3         0x843E
+#define MT6332_VDRAM_CON4         0x8440
+#define MT6332_VDRAM_CON5         0x8442
+#define MT6332_VDRAM_CON6         0x8444
+#define MT6332_VDRAM_CON7         0x8446
+#define MT6332_VDRAM_CON8         0x8448
+#define MT6332_VDRAM_CON9         0x844A
+#define MT6332_VDRAM_CON10        0x844C
+#define MT6332_VDRAM_CON11        0x844E
+#define MT6332_VDRAM_CON12        0x8450
+#define MT6332_VDRAM_CON13        0x8452
+#define MT6332_VDRAM_CON14        0x8454
+#define MT6332_VDRAM_CON15        0x8456
+#define MT6332_VDRAM_CON16        0x8458
+#define MT6332_VDRAM_CON17        0x845A
+#define MT6332_VDRAM_CON18        0x845C
+#define MT6332_VDRAM_CON19        0x845E
+#define MT6332_VDRAM_CON20        0x8460
+#define MT6332_VDRAM_CON21        0x8462
+#define MT6332_VDVFS2_CON0        0x8464
+#define MT6332_VDVFS2_CON1        0x8466
+#define MT6332_VDVFS2_CON2        0x8468
+#define MT6332_VDVFS2_CON3        0x846A
+#define MT6332_VDVFS2_CON4        0x846C
+#define MT6332_VDVFS2_CON5        0x846E
+#define MT6332_VDVFS2_CON6        0x8470
+#define MT6332_VDVFS2_CON7        0x8472
+#define MT6332_VDVFS2_CON8        0x8474
+#define MT6332_VDVFS2_CON9        0x8476
+#define MT6332_VDVFS2_CON10       0x8478
+#define MT6332_VDVFS2_CON11       0x847A
+#define MT6332_VDVFS2_CON12       0x847C
+#define MT6332_VDVFS2_CON13       0x847E
+#define MT6332_VDVFS2_CON14       0x8480
+#define MT6332_VDVFS2_CON15       0x8482
+#define MT6332_VDVFS2_CON16       0x8484
+#define MT6332_VDVFS2_CON17       0x8486
+#define MT6332_VDVFS2_CON18       0x8488
+#define MT6332_VDVFS2_CON19       0x848A
+#define MT6332_VDVFS2_CON20       0x848C
+#define MT6332_VDVFS2_CON21       0x848E
+#define MT6332_VDVFS2_CON22       0x8490
+#define MT6332_VDVFS2_CON23       0x8492
+#define MT6332_VDVFS2_CON24       0x8494
+#define MT6332_VDVFS2_CON25       0x8496
+#define MT6332_VDVFS2_CON26       0x8498
+#define MT6332_VDVFS2_CON27       0x849A
+#define MT6332_VRF1_CON0          0x849C
+#define MT6332_VRF1_CON1          0x849E
+#define MT6332_VRF1_CON2          0x84A0
+#define MT6332_VRF1_CON3          0x84A2
+#define MT6332_VRF1_CON4          0x84A4
+#define MT6332_VRF1_CON5          0x84A6
+#define MT6332_VRF1_CON6          0x84A8
+#define MT6332_VRF1_CON7          0x84AA
+#define MT6332_VRF1_CON8          0x84AC
+#define MT6332_VRF1_CON9          0x84AE
+#define MT6332_VRF1_CON10         0x84B0
+#define MT6332_VRF1_CON11         0x84B2
+#define MT6332_VRF1_CON12         0x84B4
+#define MT6332_VRF1_CON13         0x84B6
+#define MT6332_VRF1_CON14         0x84B8
+#define MT6332_VRF1_CON15         0x84BA
+#define MT6332_VRF1_CON16         0x84BC
+#define MT6332_VRF1_CON17         0x84BE
+#define MT6332_VRF1_CON18         0x84C0
+#define MT6332_VRF1_CON19         0x84C2
+#define MT6332_VRF1_CON20         0x84C4
+#define MT6332_VRF1_CON21         0x84C6
+#define MT6332_VRF2_CON0          0x84C8
+#define MT6332_VRF2_CON1          0x84CA
+#define MT6332_VRF2_CON2          0x84CC
+#define MT6332_VRF2_CON3          0x84CE
+#define MT6332_VRF2_CON4          0x84D0
+#define MT6332_VRF2_CON5          0x84D2
+#define MT6332_VRF2_CON6          0x84D4
+#define MT6332_VRF2_CON7          0x84D6
+#define MT6332_VRF2_CON8          0x84D8
+#define MT6332_VRF2_CON9          0x84DA
+#define MT6332_VRF2_CON10         0x84DC
+#define MT6332_VRF2_CON11         0x84DE
+#define MT6332_VRF2_CON12         0x84E0
+#define MT6332_VRF2_CON13         0x84E2
+#define MT6332_VRF2_CON14         0x84E4
+#define MT6332_VRF2_CON15         0x84E6
+#define MT6332_VRF2_CON16         0x84E8
+#define MT6332_VRF2_CON17         0x84EA
+#define MT6332_VRF2_CON18         0x84EC
+#define MT6332_VRF2_CON19         0x84EE
+#define MT6332_VRF2_CON20         0x84F0
+#define MT6332_VRF2_CON21         0x84F2
+#define MT6332_VPA_CON0           0x84F4
+#define MT6332_VPA_CON1           0x84F6
+#define MT6332_VPA_CON2           0x84F8
+#define MT6332_VPA_CON3           0x84FC
+#define MT6332_VPA_CON4           0x84FE
+#define MT6332_VPA_CON5           0x8500
+#define MT6332_VPA_CON6           0x8502
+#define MT6332_VPA_CON7           0x8504
+#define MT6332_VPA_CON8           0x8506
+#define MT6332_VPA_CON9           0x8508
+#define MT6332_VPA_CON10          0x850A
+#define MT6332_VPA_CON11          0x850C
+#define MT6332_VPA_CON12          0x850E
+#define MT6332_VPA_CON13          0x8510
+#define MT6332_VPA_CON14          0x8512
+#define MT6332_VPA_CON15          0x8514
+#define MT6332_VPA_CON16          0x8516
+#define MT6332_VPA_CON17          0x8518
+#define MT6332_VPA_CON18          0x851A
+#define MT6332_VPA_CON19          0x851C
+#define MT6332_VPA_CON20          0x851E
+#define MT6332_VPA_CON21          0x8520
+#define MT6332_VPA_CON22          0x8522
+#define MT6332_VPA_CON23          0x8524
+#define MT6332_VPA_CON24          0x8526
+#define MT6332_VPA_CON25          0x8528
+#define MT6332_VSBST_CON0         0x852A
+#define MT6332_VSBST_CON1         0x852C
+#define MT6332_VSBST_CON2         0x852E
+#define MT6332_VSBST_CON3         0x8530
+#define MT6332_VSBST_CON4         0x8532
+#define MT6332_VSBST_CON5         0x8534
+#define MT6332_VSBST_CON6         0x8536
+#define MT6332_VSBST_CON7         0x8538
+#define MT6332_VSBST_CON8         0x853A
+#define MT6332_VSBST_CON9         0x853C
+#define MT6332_VSBST_CON10        0x853E
+#define MT6332_VSBST_CON11        0x8540
+#define MT6332_VSBST_CON12        0x8542
+#define MT6332_VSBST_CON13        0x8544
+#define MT6332_VSBST_CON14        0x8546
+#define MT6332_VSBST_CON15        0x8548
+#define MT6332_VSBST_CON16        0x854A
+#define MT6332_VSBST_CON17        0x854C
+#define MT6332_VSBST_CON18        0x854E
+#define MT6332_VSBST_CON19        0x8550
+#define MT6332_VSBST_CON20        0x8552
+#define MT6332_VSBST_CON21        0x8554
+#define MT6332_BUCK_K_CON0        0x8556
+#define MT6332_BUCK_K_CON1        0x8558
+#define MT6332_BUCK_K_CON2        0x855A
+#define MT6332_BUCK_K_CON3        0x855C
+#define MT6332_BUCK_K_CON4        0x855E
+#define MT6332_BUCK_K_CON5        0x8560
+#define MT6332_AUXADC_ADC0        0x8800
+#define MT6332_AUXADC_ADC1        0x8802
+#define MT6332_AUXADC_ADC2        0x8804
+#define MT6332_AUXADC_ADC3        0x8806
+#define MT6332_AUXADC_ADC4        0x8808
+#define MT6332_AUXADC_ADC5        0x880A
+#define MT6332_AUXADC_ADC6        0x880C
+#define MT6332_AUXADC_ADC7        0x880E
+#define MT6332_AUXADC_ADC8        0x8810
+#define MT6332_AUXADC_ADC9        0x8812
+#define MT6332_AUXADC_ADC10       0x8814
+#define MT6332_AUXADC_ADC11       0x8816
+#define MT6332_AUXADC_ADC12       0x8818
+#define MT6332_AUXADC_ADC13       0x881A
+#define MT6332_AUXADC_ADC14       0x881C
+#define MT6332_AUXADC_ADC15       0x881E
+#define MT6332_AUXADC_ADC16       0x8820
+#define MT6332_AUXADC_ADC17       0x8822
+#define MT6332_AUXADC_ADC18       0x8824
+#define MT6332_AUXADC_ADC19       0x8826
+#define MT6332_AUXADC_ADC20       0x8828
+#define MT6332_AUXADC_ADC21       0x882A
+#define MT6332_AUXADC_ADC22       0x882C
+#define MT6332_AUXADC_ADC23       0x882E
+#define MT6332_AUXADC_ADC24       0x8830
+#define MT6332_AUXADC_ADC25       0x8832
+#define MT6332_AUXADC_ADC26       0x8834
+#define MT6332_AUXADC_ADC27       0x8836
+#define MT6332_AUXADC_ADC28       0x8838
+#define MT6332_AUXADC_ADC29       0x883A
+#define MT6332_AUXADC_ADC30       0x883C
+#define MT6332_AUXADC_ADC31       0x883E
+#define MT6332_AUXADC_ADC32       0x8840
+#define MT6332_AUXADC_ADC33       0x8842
+#define MT6332_AUXADC_ADC34       0x8844
+#define MT6332_AUXADC_ADC35       0x8846
+#define MT6332_AUXADC_ADC36       0x8848
+#define MT6332_AUXADC_ADC37       0x884A
+#define MT6332_AUXADC_ADC38       0x884C
+#define MT6332_AUXADC_ADC39       0x884E
+#define MT6332_AUXADC_ADC40       0x8850
+#define MT6332_AUXADC_ADC41       0x8852
+#define MT6332_AUXADC_ADC42       0x8854
+#define MT6332_AUXADC_ADC43       0x8856
+#define MT6332_AUXADC_STA0        0x8858
+#define MT6332_AUXADC_STA1        0x885A
+#define MT6332_AUXADC_RQST0       0x885C
+#define MT6332_AUXADC_RQST0_SET   0x885E
+#define MT6332_AUXADC_RQST0_CLR   0x8860
+#define MT6332_AUXADC_RQST1       0x8862
+#define MT6332_AUXADC_RQST1_SET   0x8864
+#define MT6332_AUXADC_RQST1_CLR   0x8866
+#define MT6332_AUXADC_CON0        0x8868
+#define MT6332_AUXADC_CON1        0x886A
+#define MT6332_AUXADC_CON2        0x886C
+#define MT6332_AUXADC_CON3        0x886E
+#define MT6332_AUXADC_CON4        0x8870
+#define MT6332_AUXADC_CON5        0x8872
+#define MT6332_AUXADC_CON6        0x8874
+#define MT6332_AUXADC_CON7        0x8876
+#define MT6332_AUXADC_CON8        0x8878
+#define MT6332_AUXADC_CON9        0x887A
+#define MT6332_AUXADC_CON10       0x887C
+#define MT6332_AUXADC_CON11       0x887E
+#define MT6332_AUXADC_CON12       0x8880
+#define MT6332_AUXADC_CON13       0x8882
+#define MT6332_AUXADC_CON14       0x8884
+#define MT6332_AUXADC_CON15       0x8886
+#define MT6332_AUXADC_CON16       0x8888
+#define MT6332_AUXADC_CON17       0x888A
+#define MT6332_AUXADC_CON18       0x888C
+#define MT6332_AUXADC_CON19       0x888E
+#define MT6332_AUXADC_CON20       0x8890
+#define MT6332_AUXADC_CON21       0x8892
+#define MT6332_AUXADC_CON22       0x8894
+#define MT6332_AUXADC_CON23       0x8896
+#define MT6332_AUXADC_CON24       0x8898
+#define MT6332_AUXADC_CON25       0x889A
+#define MT6332_AUXADC_CON26       0x889C
+#define MT6332_AUXADC_CON27       0x889E
+#define MT6332_AUXADC_CON28       0x88A0
+#define MT6332_AUXADC_CON29       0x88A2
+#define MT6332_AUXADC_CON30       0x88A4
+#define MT6332_AUXADC_CON31       0x88A6
+#define MT6332_AUXADC_CON32       0x88A8
+#define MT6332_AUXADC_CON33       0x88AA
+#define MT6332_AUXADC_CON34       0x88AC
+#define MT6332_AUXADC_CON35       0x88AE
+#define MT6332_AUXADC_CON36       0x88B0
+#define MT6332_AUXADC_CON37       0x88B2
+#define MT6332_AUXADC_CON38       0x88B4
+#define MT6332_AUXADC_CON39       0x88B6
+#define MT6332_AUXADC_CON40       0x88B8
+#define MT6332_AUXADC_CON41       0x88BA
+#define MT6332_AUXADC_CON42       0x88BC
+#define MT6332_AUXADC_CON43       0x88BE
+#define MT6332_AUXADC_CON44       0x88C0
+#define MT6332_AUXADC_CON45       0x88C2
+#define MT6332_AUXADC_CON46       0x88C4
+#define MT6332_AUXADC_CON47       0x88C6
+#define MT6332_STRUP_CONA0        0x8C00
+#define MT6332_STRUP_CONA1        0x8C02
+#define MT6332_STRUP_CONA2        0x8C04
+#define MT6332_STRUP_CON0         0x8C06
+#define MT6332_STRUP_CON2         0x8C08
+#define MT6332_STRUP_CON3         0x8C0A
+#define MT6332_STRUP_CON4         0x8C0C
+#define MT6332_STRUP_CON5         0x8C0E
+#define MT6332_STRUP_CON6         0x8C10
+#define MT6332_STRUP_CON7         0x8C12
+#define MT6332_STRUP_CON8         0x8C14
+#define MT6332_STRUP_CON9         0x8C16
+#define MT6332_STRUP_CON10        0x8C18
+#define MT6332_STRUP_CON11        0x8C1A
+#define MT6332_STRUP_CON12        0x8C1C
+#define MT6332_STRUP_CON13        0x8C1E
+#define MT6332_STRUP_CON14        0x8C20
+#define MT6332_STRUP_CON15        0x8C22
+#define MT6332_STRUP_CON16        0x8C24
+#define MT6332_STRUP_CON17        0x8C26
+#define MT6332_FGADC_CON0         0x8C28
+#define MT6332_FGADC_CON1         0x8C2A
+#define MT6332_FGADC_CON2         0x8C2C
+#define MT6332_FGADC_CON3         0x8C2E
+#define MT6332_FGADC_CON4         0x8C30
+#define MT6332_FGADC_CON5         0x8C32
+#define MT6332_FGADC_CON6         0x8C34
+#define MT6332_FGADC_CON7         0x8C36
+#define MT6332_FGADC_CON8         0x8C38
+#define MT6332_FGADC_CON9         0x8C3A
+#define MT6332_FGADC_CON10        0x8C3C
+#define MT6332_FGADC_CON11        0x8C3E
+#define MT6332_FGADC_CON12        0x8C40
+#define MT6332_FGADC_CON13        0x8C42
+#define MT6332_FGADC_CON14        0x8C44
+#define MT6332_FGADC_CON15        0x8C46
+#define MT6332_FGADC_CON16        0x8C48
+#define MT6332_FGADC_CON17        0x8C4A
+#define MT6332_FGADC_CON18        0x8C4C
+#define MT6332_FGADC_CON19        0x8C4E
+#define MT6332_FGADC_CON20        0x8C50
+#define MT6332_FGADC_CON21        0x8C52
+#define MT6332_FGADC_CON22        0x8C54
+#define MT6332_OTP_CON0           0x8C56
+#define MT6332_OTP_CON1           0x8C58
+#define MT6332_OTP_CON2           0x8C5A
+#define MT6332_OTP_CON3           0x8C5C
+#define MT6332_OTP_CON4           0x8C5E
+#define MT6332_OTP_CON5           0x8C60
+#define MT6332_OTP_CON6           0x8C62
+#define MT6332_OTP_CON7           0x8C64
+#define MT6332_OTP_CON8           0x8C66
+#define MT6332_OTP_CON9           0x8C68
+#define MT6332_OTP_CON10          0x8C6A
+#define MT6332_OTP_CON11          0x8C6C
+#define MT6332_OTP_CON12          0x8C6E
+#define MT6332_OTP_CON13          0x8C70
+#define MT6332_OTP_CON14          0x8C72
+#define MT6332_OTP_DOUT_0_15      0x8C74
+#define MT6332_OTP_DOUT_16_31     0x8C76
+#define MT6332_OTP_DOUT_32_47     0x8C78
+#define MT6332_OTP_DOUT_48_63     0x8C7A
+#define MT6332_OTP_DOUT_64_79     0x8C7C
+#define MT6332_OTP_DOUT_80_95     0x8C7E
+#define MT6332_OTP_DOUT_96_111    0x8C80
+#define MT6332_OTP_DOUT_112_127   0x8C82
+#define MT6332_OTP_DOUT_128_143   0x8C84
+#define MT6332_OTP_DOUT_144_159   0x8C86
+#define MT6332_OTP_DOUT_160_175   0x8C88
+#define MT6332_OTP_DOUT_176_191   0x8C8A
+#define MT6332_OTP_DOUT_192_207   0x8C8C
+#define MT6332_OTP_DOUT_208_223   0x8C8E
+#define MT6332_OTP_DOUT_224_239   0x8C90
+#define MT6332_OTP_DOUT_240_255   0x8C92
+#define MT6332_OTP_VAL_0_15       0x8C94
+#define MT6332_OTP_VAL_16_31      0x8C96
+#define MT6332_OTP_VAL_32_47      0x8C98
+#define MT6332_OTP_VAL_48_63      0x8C9A
+#define MT6332_OTP_VAL_64_79      0x8C9C
+#define MT6332_OTP_VAL_80_95      0x8C9E
+#define MT6332_OTP_VAL_96_111     0x8CA0
+#define MT6332_OTP_VAL_112_127    0x8CA2
+#define MT6332_OTP_VAL_128_143    0x8CA4
+#define MT6332_OTP_VAL_144_159    0x8CA6
+#define MT6332_OTP_VAL_160_175    0x8CA8
+#define MT6332_OTP_VAL_176_191    0x8CAA
+#define MT6332_OTP_VAL_192_207    0x8CAC
+#define MT6332_OTP_VAL_208_223    0x8CAE
+#define MT6332_OTP_VAL_224_239    0x8CB0
+#define MT6332_OTP_VAL_240_255    0x8CB2
+#define MT6332_LDO_CON0           0x8CB4
+#define MT6332_LDO_CON1           0x8CB6
+#define MT6332_LDO_CON2           0x8CB8
+#define MT6332_LDO_CON3           0x8CBA
+#define MT6332_LDO_CON5           0x8CBC
+#define MT6332_LDO_CON6           0x8CBE
+#define MT6332_LDO_CON7           0x8CC0
+#define MT6332_LDO_CON8           0x8CC2
+#define MT6332_LDO_CON9           0x8CC4
+#define MT6332_LDO_CON10          0x8CC6
+#define MT6332_LDO_CON11          0x8CC8
+#define MT6332_LDO_CON12          0x8CCA
+#define MT6332_LDO_CON13          0x8CCC
+#define MT6332_FQMTR_CON0         0x8CCE
+#define MT6332_FQMTR_CON1         0x8CD0
+#define MT6332_FQMTR_CON2         0x8CD2
+#define MT6332_IWLED_CON0         0x8CD4
+#define MT6332_IWLED_DEG          0x8CD6
+#define MT6332_IWLED_STATUS       0x8CD8
+#define MT6332_IWLED_EN_CTRL      0x8CDA
+#define MT6332_IWLED_CON1         0x8CDC
+#define MT6332_IWLED_CON2         0x8CDE
+#define MT6332_IWLED_TRIM0        0x8CE0
+#define MT6332_IWLED_TRIM1        0x8CE2
+#define MT6332_IWLED_CON3         0x8CE4
+#define MT6332_IWLED_CON4         0x8CE6
+#define MT6332_IWLED_CON5         0x8CE8
+#define MT6332_IWLED_CON6         0x8CEA
+#define MT6332_IWLED_CON7         0x8CEC
+#define MT6332_IWLED_CON8         0x8CEE
+#define MT6332_IWLED_CON9         0x8CF0
+#define MT6332_SPK_CON0           0x8CF2
+#define MT6332_SPK_CON1           0x8CF4
+#define MT6332_SPK_CON2           0x8CF6
+#define MT6332_SPK_CON3           0x8CF8
+#define MT6332_SPK_CON4           0x8CFA
+#define MT6332_SPK_CON5           0x8CFC
+#define MT6332_SPK_CON6           0x8CFE
+#define MT6332_SPK_CON7           0x8D00
+#define MT6332_SPK_CON8           0x8D02
+#define MT6332_SPK_CON9           0x8D04
+#define MT6332_SPK_CON10          0x8D06
+#define MT6332_SPK_CON11          0x8D08
+#define MT6332_SPK_CON12          0x8D0A
+#define MT6332_SPK_CON13          0x8D0C
+#define MT6332_SPK_CON14          0x8D0E
+#define MT6332_SPK_CON15          0x8D10
+#define MT6332_SPK_CON16          0x8D12
+#define MT6332_TESTI_CON0         0x8D14
+#define MT6332_TESTI_CON1         0x8D16
+#define MT6332_TESTI_CON2         0x8D18
+#define MT6332_TESTI_CON3         0x8D1A
+#define MT6332_TESTI_CON4         0x8D1C
+#define MT6332_TESTI_CON5         0x8D1E
+#define MT6332_TESTI_CON6         0x8D20
+#define MT6332_TESTI_MUX_CON0     0x8D22
+#define MT6332_TESTI_MUX_CON1     0x8D24
+#define MT6332_TESTI_MUX_CON2     0x8D26
+#define MT6332_TESTI_MUX_CON3     0x8D28
+#define MT6332_TESTI_MUX_CON4     0x8D2A
+#define MT6332_TESTI_MUX_CON5     0x8D2C
+#define MT6332_TESTI_MUX_CON6     0x8D2E
+#define MT6332_TESTO_CON0         0x8D30
+#define MT6332_TESTO_CON1         0x8D32
+#define MT6332_TEST_OMUX_CON0     0x8D34
+#define MT6332_TEST_OMUX_CON1     0x8D36
+#define MT6332_DEBUG_CON0         0x8D38
+#define MT6332_DEBUG_CON1         0x8D3A
+#define MT6332_DEBUG_CON2         0x8D3C
+#define MT6332_FGADC_CON23        0x8D3E
+#define MT6332_FGADC_CON24        0x8D40
+#define MT6332_FGADC_CON25        0x8D42
+#define MT6332_TOP_RST_STATUS     0x8D44
+#define MT6332_TOP_RST_STATUS_SET 0x8D46
+#define MT6332_TOP_RST_STATUS_CLR 0x8D48
+#define MT6332_VDVFS2_CON28       0x8D4A
+
+#endif /* __MFD_MT6332_REGISTERS_H__ */
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 3fecaffe5019..627487e26287 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -12,6 +12,8 @@
 
 enum chip_id {
 	MT6323_CHIP_ID = 0x23,
+	MT6331_CHIP_ID = 0x20,
+	MT6332_CHIP_ID = 0x20,
 	MT6357_CHIP_ID = 0x57,
 	MT6358_CHIP_ID = 0x58,
 	MT6359_CHIP_ID = 0x59,
-- 
cgit v1.2.3


From 2b3416ceff5e6bd4922f6d1c61fb68113dd82302 Mon Sep 17 00:00:00 2001
From: Yang Xu <xuyang2018.jy@fujitsu.com>
Date: Thu, 14 Jul 2022 14:11:25 +0800
Subject: fs: add mode_strip_sgid() helper

Add a dedicated helper to handle the setgid bit when creating a new file
in a setgid directory. This is a preparatory patch for moving setgid
stripping into the vfs. The patch contains no functional changes.

Currently the setgid stripping logic is open-coded directly in
inode_init_owner() and the individual filesystems are responsible for
handling setgid inheritance. Since this has proven to be brittle as
evidenced by old issues we uncovered over the last months (see [1] to
[3] below) we will try to move this logic into the vfs.

Link: e014f37db1a2 ("xfs: use setattr_copy to set vfs inode attributes") [1]
Link: 01ea173e103e ("xfs: fix up non-directory creation in SGID directories") [2]
Link: fd84bfdddd16 ("ceph: fix up non-directory creation in SGID directories") [3]
Link: https://lore.kernel.org/r/1657779088-2242-1-git-send-email-xuyang2018.jy@fujitsu.com
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-and-Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Yang Xu <xuyang2018.jy@fujitsu.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/inode.c         | 36 ++++++++++++++++++++++++++++++++----
 include/linux/fs.h |  2 ++
 2 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index bd4da9c5207e..71b36afc3893 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2246,10 +2246,8 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 		/* Directories are special, and always inherit S_ISGID */
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
-		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
-			 !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
-			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
-			mode &= ~S_ISGID;
+		else
+			mode = mode_strip_sgid(mnt_userns, dir, mode);
 	} else
 		inode_fsgid_set(inode, mnt_userns);
 	inode->i_mode = mode;
@@ -2405,3 +2403,33 @@ struct timespec64 current_time(struct inode *inode)
 	return timestamp_truncate(now, inode);
 }
 EXPORT_SYMBOL(current_time);
+
+/**
+ * mode_strip_sgid - handle the sgid bit for non-directories
+ * @mnt_userns: User namespace of the mount the inode was created from
+ * @dir: parent directory inode
+ * @mode: mode of the file to be created in @dir
+ *
+ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
+ * raised and @dir has the S_ISGID bit raised ensure that the caller is
+ * either in the group of the parent directory or they have CAP_FSETID
+ * in their user namespace and are privileged over the parent directory.
+ * In all other cases, strip the S_ISGID bit from @mode.
+ *
+ * Return: the new mode to use for the file
+ */
+umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
+			const struct inode *dir, umode_t mode)
+{
+	if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
+		return mode;
+	if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
+		return mode;
+	if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
+		return mode;
+	if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+		return mode;
+
+	return mode & ~S_ISGID;
+}
+EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..50642668c60f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1903,6 +1903,8 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
 void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 		      const struct inode *dir, umode_t mode);
 extern bool may_open_dev(const struct path *path);
+umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
+			const struct inode *dir, umode_t mode);
 
 /*
  * This is the "filldir" function type, used by readdir() to let
-- 
cgit v1.2.3


From e6b8a0a5e7f688e092d1c639d438ccd0b323213c Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Tue, 19 Jul 2022 13:52:44 -0700
Subject: PCI: Add vendor ID for the PCI SIG

This ID is used in DOE headers to identify protocols that are defined
within the PCI Express Base Specification, PCIe r6.0, sec 6.30.1.1 table
6-32.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20220719205249.566684-2-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0178823ce8c2..8af3b86206b1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -151,6 +151,7 @@
 #define PCI_CLASS_OTHERS		0xff
 
 /* Vendors and devices.  Sort key: vendor first, device next. */
+#define PCI_VENDOR_ID_PCI_SIG		0x0001
 
 #define PCI_VENDOR_ID_LOONGSON		0x0014
 
-- 
cgit v1.2.3


From 9d24322e887b6a3d3f9f9c3e76937a646102c8c1 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Tue, 19 Jul 2022 13:52:46 -0700
Subject: PCI/DOE: Add DOE mailbox support functions

Introduced in a PCIe r6.0, sec 6.30, DOE provides a config space based
mailbox with standard protocol discovery.  Each mailbox is accessed
through a DOE Extended Capability.

Each DOE mailbox must support the DOE discovery protocol in addition to
any number of additional protocols.

Define core PCIe functionality to manage a single PCIe DOE mailbox at a
defined config space offset.  Functionality includes iterating,
creating, query of supported protocol, and task submission.  Destruction
of the mailboxes is device managed.

Cc: "Li, Ming" <ming4.li@intel.com>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Acked-by: Bjorn Helgaas <helgaas@kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Co-developed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20220719205249.566684-4-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 .clang-format                 |   1 +
 drivers/pci/Kconfig           |   3 +
 drivers/pci/Makefile          |   1 +
 drivers/pci/doe.c             | 536 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci-doe.h       |  77 ++++++
 include/uapi/linux/pci_regs.h |  29 ++-
 6 files changed, 646 insertions(+), 1 deletion(-)
 create mode 100644 drivers/pci/doe.c
 create mode 100644 include/linux/pci-doe.h

(limited to 'include/linux')

diff --git a/.clang-format b/.clang-format
index 9b87ea1fc16e..1247d54f9e49 100644
--- a/.clang-format
+++ b/.clang-format
@@ -516,6 +516,7 @@ ForEachMacros:
   - 'of_property_for_each_string'
   - 'of_property_for_each_u32'
   - 'pci_bus_for_each_resource'
+  - 'pci_doe_for_each_off'
   - 'pcl_for_each_chunk'
   - 'pcl_for_each_segment'
   - 'pcm_for_each_format'
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 133c73207782..b2f2e588a817 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -121,6 +121,9 @@ config XEN_PCIDEV_FRONTEND
 config PCI_ATS
 	bool
 
+config PCI_DOE
+	bool
+
 config PCI_ECAM
 	bool
 
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 0da6b1ebc694..2680e4c92f0a 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_PCI_ECAM)		+= ecam.o
 obj-$(CONFIG_PCI_P2PDMA)	+= p2pdma.o
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
+obj-$(CONFIG_PCI_DOE)		+= doe.o
 
 # Endpoint library must be initialized before its users
 obj-$(CONFIG_PCI_ENDPOINT)	+= endpoint/
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
new file mode 100644
index 000000000000..e402f05068a5
--- /dev/null
+++ b/drivers/pci/doe.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *	Jonathan Cameron <Jonathan.Cameron@huawei.com>
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny <ira.weiny@intel.com>
+ */
+
+#define dev_fmt(fmt) "DOE: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pci-doe.h>
+#include <linux/workqueue.h>
+
+#define PCI_DOE_PROTOCOL_DISCOVERY 0
+
+/* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */
+#define PCI_DOE_TIMEOUT HZ
+#define PCI_DOE_POLL_INTERVAL	(PCI_DOE_TIMEOUT / 128)
+
+#define PCI_DOE_FLAG_CANCEL	0
+#define PCI_DOE_FLAG_DEAD	1
+
+/**
+ * struct pci_doe_mb - State for a single DOE mailbox
+ *
+ * This state is used to manage a single DOE mailbox capability.  All fields
+ * should be considered opaque to the consumers and the structure passed into
+ * the helpers below after being created by devm_pci_doe_create()
+ *
+ * @pdev: PCI device this mailbox belongs to
+ * @cap_offset: Capability offset
+ * @prots: Array of protocols supported (encoded as long values)
+ * @wq: Wait queue for work item
+ * @work_queue: Queue of pci_doe_work items
+ * @flags: Bit array of PCI_DOE_FLAG_* flags
+ */
+struct pci_doe_mb {
+	struct pci_dev *pdev;
+	u16 cap_offset;
+	struct xarray prots;
+
+	wait_queue_head_t wq;
+	struct workqueue_struct *work_queue;
+	unsigned long flags;
+};
+
+static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout)
+{
+	if (wait_event_timeout(doe_mb->wq,
+			       test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags),
+			       timeout))
+		return -EIO;
+	return 0;
+}
+
+static void pci_doe_write_ctrl(struct pci_doe_mb *doe_mb, u32 val)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_CTRL, val);
+}
+
+static int pci_doe_abort(struct pci_doe_mb *doe_mb)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	unsigned long timeout_jiffies;
+
+	pci_dbg(pdev, "[%x] Issuing Abort\n", offset);
+
+	timeout_jiffies = jiffies + PCI_DOE_TIMEOUT;
+	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_ABORT);
+
+	do {
+		int rc;
+		u32 val;
+
+		rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL);
+		if (rc)
+			return rc;
+		pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+
+		/* Abort success! */
+		if (!FIELD_GET(PCI_DOE_STATUS_ERROR, val) &&
+		    !FIELD_GET(PCI_DOE_STATUS_BUSY, val))
+			return 0;
+
+	} while (!time_after(jiffies, timeout_jiffies));
+
+	/* Abort has timed out and the MB is dead */
+	pci_err(pdev, "[%x] ABORT timed out\n", offset);
+	return -EIO;
+}
+
+static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
+			    struct pci_doe_task *task)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+	int i;
+
+	/*
+	 * Check the DOE busy bit is not set. If it is set, this could indicate
+	 * someone other than Linux (e.g. firmware) is using the mailbox. Note
+	 * it is expected that firmware and OS will negotiate access rights via
+	 * an, as yet to be defined, method.
+	 */
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_BUSY, val))
+		return -EBUSY;
+
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
+		return -EIO;
+
+	/* Write DOE Header */
+	val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) |
+		FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type);
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val);
+	/* Length is 2 DW of header + length of payload in DW */
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
+			       FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH,
+					  2 + task->request_pl_sz /
+						sizeof(u32)));
+	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
+		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
+				       task->request_pl[i]);
+
+	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
+
+	return 0;
+}
+
+static bool pci_doe_data_obj_ready(struct pci_doe_mb *doe_mb)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val))
+		return true;
+	return false;
+}
+
+static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	size_t length, payload_length;
+	u32 val;
+	int i;
+
+	/* Read the first dword to get the protocol */
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	if ((FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val) != task->prot.vid) ||
+	    (FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val) != task->prot.type)) {
+		dev_err_ratelimited(&pdev->dev, "[%x] expected [VID, Protocol] = [%04x, %02x], got [%04x, %02x]\n",
+				    doe_mb->cap_offset, task->prot.vid, task->prot.type,
+				    FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val),
+				    FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val));
+		return -EIO;
+	}
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	/* Read the second dword to get the length */
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+
+	length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val);
+	if (length > SZ_1M || length < 2)
+		return -EIO;
+
+	/* First 2 dwords have already been read */
+	length -= 2;
+	payload_length = min(length, task->response_pl_sz / sizeof(u32));
+	/* Read the rest of the response payload */
+	for (i = 0; i < payload_length; i++) {
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
+				      &task->response_pl[i]);
+		/* Prior to the last ack, ensure Data Object Ready */
+		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
+			return -EIO;
+		pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	}
+
+	/* Flush excess length */
+	for (; i < length; i++) {
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+		pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	}
+
+	/* Final error check to pick up on any since Data Object Ready */
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
+		return -EIO;
+
+	return min(length, task->response_pl_sz / sizeof(u32)) * sizeof(u32);
+}
+
+static void signal_task_complete(struct pci_doe_task *task, int rv)
+{
+	task->rv = rv;
+	task->complete(task);
+}
+
+static void signal_task_abort(struct pci_doe_task *task, int rv)
+{
+	struct pci_doe_mb *doe_mb = task->doe_mb;
+	struct pci_dev *pdev = doe_mb->pdev;
+
+	if (pci_doe_abort(doe_mb)) {
+		/*
+		 * If the device can't process an abort; set the mailbox dead
+		 *	- no more submissions
+		 */
+		pci_err(pdev, "[%x] Abort failed marking mailbox dead\n",
+			doe_mb->cap_offset);
+		set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
+	}
+	signal_task_complete(task, rv);
+}
+
+static void doe_statemachine_work(struct work_struct *work)
+{
+	struct pci_doe_task *task = container_of(work, struct pci_doe_task,
+						 work);
+	struct pci_doe_mb *doe_mb = task->doe_mb;
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	unsigned long timeout_jiffies;
+	u32 val;
+	int rc;
+
+	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) {
+		signal_task_complete(task, -EIO);
+		return;
+	}
+
+	/* Send request */
+	rc = pci_doe_send_req(doe_mb, task);
+	if (rc) {
+		/*
+		 * The specification does not provide any guidance on how to
+		 * resolve conflicting requests from other entities.
+		 * Furthermore, it is likely that busy will not be detected
+		 * most of the time.  Flag any detection of status busy with an
+		 * error.
+		 */
+		if (rc == -EBUSY)
+			dev_err_ratelimited(&pdev->dev, "[%x] busy detected; another entity is sending conflicting requests\n",
+					    offset);
+		signal_task_abort(task, rc);
+		return;
+	}
+
+	timeout_jiffies = jiffies + PCI_DOE_TIMEOUT;
+	/* Poll for response */
+retry_resp:
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) {
+		signal_task_abort(task, -EIO);
+		return;
+	}
+
+	if (!FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) {
+		if (time_after(jiffies, timeout_jiffies)) {
+			signal_task_abort(task, -EIO);
+			return;
+		}
+		rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL);
+		if (rc) {
+			signal_task_abort(task, rc);
+			return;
+		}
+		goto retry_resp;
+	}
+
+	rc  = pci_doe_recv_resp(doe_mb, task);
+	if (rc < 0) {
+		signal_task_abort(task, rc);
+		return;
+	}
+
+	signal_task_complete(task, rc);
+}
+
+static void pci_doe_task_complete(struct pci_doe_task *task)
+{
+	complete(task->private);
+}
+
+static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
+			     u8 *protocol)
+{
+	u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX,
+				    *index);
+	u32 response_pl;
+	DECLARE_COMPLETION_ONSTACK(c);
+	struct pci_doe_task task = {
+		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
+		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
+		.request_pl = &request_pl,
+		.request_pl_sz = sizeof(request_pl),
+		.response_pl = &response_pl,
+		.response_pl_sz = sizeof(response_pl),
+		.complete = pci_doe_task_complete,
+		.private = &c,
+	};
+	int rc;
+
+	rc = pci_doe_submit_task(doe_mb, &task);
+	if (rc < 0)
+		return rc;
+
+	wait_for_completion(&c);
+
+	if (task.rv != sizeof(response_pl))
+		return -EIO;
+
+	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
+	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
+			      response_pl);
+	*index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX,
+			   response_pl);
+
+	return 0;
+}
+
+static void *pci_doe_xa_prot_entry(u16 vid, u8 prot)
+{
+	return xa_mk_value((vid << 8) | prot);
+}
+
+static int pci_doe_cache_protocols(struct pci_doe_mb *doe_mb)
+{
+	u8 index = 0;
+	u8 xa_idx = 0;
+
+	do {
+		int rc;
+		u16 vid;
+		u8 prot;
+
+		rc = pci_doe_discovery(doe_mb, &index, &vid, &prot);
+		if (rc)
+			return rc;
+
+		pci_dbg(doe_mb->pdev,
+			"[%x] Found protocol %d vid: %x prot: %x\n",
+			doe_mb->cap_offset, xa_idx, vid, prot);
+
+		rc = xa_insert(&doe_mb->prots, xa_idx++,
+			       pci_doe_xa_prot_entry(vid, prot), GFP_KERNEL);
+		if (rc)
+			return rc;
+	} while (index);
+
+	return 0;
+}
+
+static void pci_doe_xa_destroy(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	xa_destroy(&doe_mb->prots);
+}
+
+static void pci_doe_destroy_workqueue(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	destroy_workqueue(doe_mb->work_queue);
+}
+
+static void pci_doe_flush_mb(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	/* Stop all pending work items from starting */
+	set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
+
+	/* Cancel an in progress work item, if necessary */
+	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
+	wake_up(&doe_mb->wq);
+
+	/* Flush all work items */
+	flush_workqueue(doe_mb->work_queue);
+}
+
+/**
+ * pcim_doe_create_mb() - Create a DOE mailbox object
+ *
+ * @pdev: PCI device to create the DOE mailbox for
+ * @cap_offset: Offset of the DOE mailbox
+ *
+ * Create a single mailbox object to manage the mailbox protocol at the
+ * cap_offset specified.
+ *
+ * RETURNS: created mailbox object on success
+ *	    ERR_PTR(-errno) on failure
+ */
+struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset)
+{
+	struct pci_doe_mb *doe_mb;
+	struct device *dev = &pdev->dev;
+	int rc;
+
+	doe_mb = devm_kzalloc(dev, sizeof(*doe_mb), GFP_KERNEL);
+	if (!doe_mb)
+		return ERR_PTR(-ENOMEM);
+
+	doe_mb->pdev = pdev;
+	doe_mb->cap_offset = cap_offset;
+	init_waitqueue_head(&doe_mb->wq);
+
+	xa_init(&doe_mb->prots);
+	rc = devm_add_action(dev, pci_doe_xa_destroy, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	doe_mb->work_queue = alloc_ordered_workqueue("%s %s DOE [%x]", 0,
+						dev_driver_string(&pdev->dev),
+						pci_name(pdev),
+						doe_mb->cap_offset);
+	if (!doe_mb->work_queue) {
+		pci_err(pdev, "[%x] failed to allocate work queue\n",
+			doe_mb->cap_offset);
+		return ERR_PTR(-ENOMEM);
+	}
+	rc = devm_add_action_or_reset(dev, pci_doe_destroy_workqueue, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	/* Reset the mailbox by issuing an abort */
+	rc = pci_doe_abort(doe_mb);
+	if (rc) {
+		pci_err(pdev, "[%x] failed to reset mailbox with abort command : %d\n",
+			doe_mb->cap_offset, rc);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * The state machine and the mailbox should be in sync now;
+	 * Set up mailbox flush prior to using the mailbox to query protocols.
+	 */
+	rc = devm_add_action_or_reset(dev, pci_doe_flush_mb, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = pci_doe_cache_protocols(doe_mb);
+	if (rc) {
+		pci_err(pdev, "[%x] failed to cache protocols : %d\n",
+			doe_mb->cap_offset, rc);
+		return ERR_PTR(rc);
+	}
+
+	return doe_mb;
+}
+EXPORT_SYMBOL_GPL(pcim_doe_create_mb);
+
+/**
+ * pci_doe_supports_prot() - Return if the DOE instance supports the given
+ *			     protocol
+ * @doe_mb: DOE mailbox capability to query
+ * @vid: Protocol Vendor ID
+ * @type: Protocol type
+ *
+ * RETURNS: True if the DOE mailbox supports the protocol specified
+ */
+bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type)
+{
+	unsigned long index;
+	void *entry;
+
+	/* The discovery protocol must always be supported */
+	if (vid == PCI_VENDOR_ID_PCI_SIG && type == PCI_DOE_PROTOCOL_DISCOVERY)
+		return true;
+
+	xa_for_each(&doe_mb->prots, index, entry)
+		if (entry == pci_doe_xa_prot_entry(vid, type))
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(pci_doe_supports_prot);
+
+/**
+ * pci_doe_submit_task() - Submit a task to be processed by the state machine
+ *
+ * @doe_mb: DOE mailbox capability to submit to
+ * @task: task to be queued
+ *
+ * Submit a DOE task (request/response) to the DOE mailbox to be processed.
+ * Returns upon queueing the task object.  If the queue is full this function
+ * will sleep until there is room in the queue.
+ *
+ * task->complete will be called when the state machine is done processing this
+ * task.
+ *
+ * Excess data will be discarded.
+ *
+ * RETURNS: 0 when task has been successfully queued, -ERRNO on error
+ */
+int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+{
+	if (!pci_doe_supports_prot(doe_mb, task->prot.vid, task->prot.type))
+		return -EINVAL;
+
+	/*
+	 * DOE requests must be a whole number of DW and the response needs to
+	 * be big enough for at least 1 DW
+	 */
+	if (task->request_pl_sz % sizeof(u32) ||
+	    task->response_pl_sz < sizeof(u32))
+		return -EINVAL;
+
+	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags))
+		return -EIO;
+
+	task->doe_mb = doe_mb;
+	INIT_WORK(&task->work, doe_statemachine_work);
+	queue_work(doe_mb->work_queue, &task->work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_doe_submit_task);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
new file mode 100644
index 000000000000..ed9b4df792b8
--- /dev/null
+++ b/include/linux/pci-doe.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *     Jonathan Cameron <Jonathan.Cameron@huawei.com>
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny <ira.weiny@intel.com>
+ */
+
+#ifndef LINUX_PCI_DOE_H
+#define LINUX_PCI_DOE_H
+
+struct pci_doe_protocol {
+	u16 vid;
+	u8 type;
+};
+
+struct pci_doe_mb;
+
+/**
+ * struct pci_doe_task - represents a single query/response
+ *
+ * @prot: DOE Protocol
+ * @request_pl: The request payload
+ * @request_pl_sz: Size of the request payload (bytes)
+ * @response_pl: The response payload
+ * @response_pl_sz: Size of the response payload (bytes)
+ * @rv: Return value.  Length of received response or error (bytes)
+ * @complete: Called when task is complete
+ * @private: Private data for the consumer
+ * @work: Used internally by the mailbox
+ * @doe_mb: Used internally by the mailbox
+ *
+ * The payload sizes and rv are specified in bytes with the following
+ * restrictions concerning the protocol.
+ *
+ *	1) The request_pl_sz must be a multiple of double words (4 bytes)
+ *	2) The response_pl_sz must be >= a single double word (4 bytes)
+ *	3) rv is returned as bytes but it will be a multiple of double words
+ *
+ * NOTE there is no need for the caller to initialize work or doe_mb.
+ */
+struct pci_doe_task {
+	struct pci_doe_protocol prot;
+	u32 *request_pl;
+	size_t request_pl_sz;
+	u32 *response_pl;
+	size_t response_pl_sz;
+	int rv;
+	void (*complete)(struct pci_doe_task *task);
+	void *private;
+
+	/* No need for the user to initialize these fields */
+	struct work_struct work;
+	struct pci_doe_mb *doe_mb;
+};
+
+/**
+ * pci_doe_for_each_off - Iterate each DOE capability
+ * @pdev: struct pci_dev to iterate
+ * @off: u16 of config space offset of each mailbox capability found
+ */
+#define pci_doe_for_each_off(pdev, off) \
+	for (off = pci_find_next_ext_capability(pdev, off, \
+					PCI_EXT_CAP_ID_DOE); \
+		off > 0; \
+		off = pci_find_next_ext_capability(pdev, off, \
+					PCI_EXT_CAP_ID_DOE))
+
+struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
+bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
+int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task);
+
+#endif
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 108f8523fa04..57b8e2ffb1dd 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -737,7 +737,8 @@
 #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
+#define PCI_EXT_CAP_ID_DOE	0x2E	/* Data Object Exchange */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -1103,4 +1104,30 @@
 #define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK		0x000000F0
 #define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT	4
 
+/* Data Object Exchange */
+#define PCI_DOE_CAP		0x04    /* DOE Capabilities Register */
+#define  PCI_DOE_CAP_INT_SUP			0x00000001  /* Interrupt Support */
+#define  PCI_DOE_CAP_INT_MSG_NUM		0x00000ffe  /* Interrupt Message Number */
+#define PCI_DOE_CTRL		0x08    /* DOE Control Register */
+#define  PCI_DOE_CTRL_ABORT			0x00000001  /* DOE Abort */
+#define  PCI_DOE_CTRL_INT_EN			0x00000002  /* DOE Interrupt Enable */
+#define  PCI_DOE_CTRL_GO			0x80000000  /* DOE Go */
+#define PCI_DOE_STATUS		0x0c    /* DOE Status Register */
+#define  PCI_DOE_STATUS_BUSY			0x00000001  /* DOE Busy */
+#define  PCI_DOE_STATUS_INT_STATUS		0x00000002  /* DOE Interrupt Status */
+#define  PCI_DOE_STATUS_ERROR			0x00000004  /* DOE Error */
+#define  PCI_DOE_STATUS_DATA_OBJECT_READY	0x80000000  /* Data Object Ready */
+#define PCI_DOE_WRITE		0x10    /* DOE Write Data Mailbox Register */
+#define PCI_DOE_READ		0x14    /* DOE Read Data Mailbox Register */
+
+/* DOE Data Object - note not actually registers */
+#define PCI_DOE_DATA_OBJECT_HEADER_1_VID		0x0000ffff
+#define PCI_DOE_DATA_OBJECT_HEADER_1_TYPE		0x00ff0000
+#define PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH		0x0003ffff
+
+#define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX		0x000000ff
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID		0x0000ffff
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL		0x00ff0000
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX	0xff000000
+
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 9d6794feeb90903b10c34bddd9c74c992447ce83 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 19 Jul 2022 13:52:48 -0700
Subject: driver-core: Introduce BIN_ATTR_ADMIN_{RO,RW}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many binary attributes need to limit access to CAP_SYS_ADMIN only; ie
many binary attributes specify is_visible with 0400 or 0600.

Make setting the permissions of such attributes more explicit by
defining BIN_ATTR_ADMIN_{RO,RW}.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Krzysztof Wilczyński <kw@linux.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20220719205249.566684-6-ira.weiny@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/sysfs.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index e3f1e8ac1f85..fd3fe5c8c17f 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -235,6 +235,22 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)
 #define BIN_ATTR_RW(_name, _size)					\
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)
 
+
+#define __BIN_ATTR_ADMIN_RO(_name, _size) {					\
+	.attr	= { .name = __stringify(_name), .mode = 0400 },		\
+	.read	= _name##_read,						\
+	.size	= _size,						\
+}
+
+#define __BIN_ATTR_ADMIN_RW(_name, _size)					\
+	__BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size)
+
+#define BIN_ATTR_ADMIN_RO(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)
+
+#define BIN_ATTR_ADMIN_RW(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)
+
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *, char *);
 	ssize_t	(*store)(struct kobject *, struct attribute *, const char *, size_t);
-- 
cgit v1.2.3


From ce4b4657ff18925c315855aa290e93c5fa652d96 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 19 Jul 2022 21:02:48 -0300
Subject: vfio: Replace the DMA unmapping notifier with a callback

Instead of having drivers register the notifier with explicit code just
have them provide a dma_unmap callback op in their driver ops and rely on
the core code to wire it up.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com>
Reviewed-by: Eric Farman <farman@linux.ibm.com>
Reviewed-by: Zhenyu Wang <zhenyuw@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/1-v4-681e038e30fd+78-vfio_unmap_notif_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/gpu/drm/i915/gvt/gvt.h        |   1 -
 drivers/gpu/drm/i915/gvt/kvmgt.c      |  75 +++++---------------
 drivers/s390/cio/vfio_ccw_ops.c       |  39 +++-------
 drivers/s390/cio/vfio_ccw_private.h   |   2 -
 drivers/s390/crypto/vfio_ap_ops.c     |  53 +++-----------
 drivers/s390/crypto/vfio_ap_private.h |   3 -
 drivers/vfio/vfio.c                   | 129 ++++++++++++----------------------
 drivers/vfio/vfio.h                   |   3 +
 include/linux/vfio.h                  |  21 ++----
 9 files changed, 86 insertions(+), 240 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index aee1a45da74b..705689e64011 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -226,7 +226,6 @@ struct intel_vgpu {
 	unsigned long nr_cache_entries;
 	struct mutex cache_lock;
 
-	struct notifier_block iommu_notifier;
 	atomic_t released;
 
 	struct kvm_page_track_notifier_node track_node;
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index e2f6c56ab342..ecd5bb37b63a 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -729,34 +729,25 @@ int intel_gvt_set_edid(struct intel_vgpu *vgpu, int port_num)
 	return ret;
 }
 
-static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
-				     unsigned long action, void *data)
+static void intel_vgpu_dma_unmap(struct vfio_device *vfio_dev, u64 iova,
+				 u64 length)
 {
-	struct intel_vgpu *vgpu =
-		container_of(nb, struct intel_vgpu, iommu_notifier);
-
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
-		struct vfio_iommu_type1_dma_unmap *unmap = data;
-		struct gvt_dma *entry;
-		unsigned long iov_pfn, end_iov_pfn;
+	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
+	struct gvt_dma *entry;
+	u64 iov_pfn = iova >> PAGE_SHIFT;
+	u64 end_iov_pfn = iov_pfn + length / PAGE_SIZE;
 
-		iov_pfn = unmap->iova >> PAGE_SHIFT;
-		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;
+	mutex_lock(&vgpu->cache_lock);
+	for (; iov_pfn < end_iov_pfn; iov_pfn++) {
+		entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
+		if (!entry)
+			continue;
 
-		mutex_lock(&vgpu->cache_lock);
-		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
-			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
-			if (!entry)
-				continue;
-
-			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
-					   entry->size);
-			__gvt_cache_remove_entry(vgpu, entry);
-		}
-		mutex_unlock(&vgpu->cache_lock);
+		gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
+				   entry->size);
+		__gvt_cache_remove_entry(vgpu, entry);
 	}
-
-	return NOTIFY_OK;
+	mutex_unlock(&vgpu->cache_lock);
 }
 
 static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu)
@@ -783,36 +774,20 @@ out:
 static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
 {
 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
-	unsigned long events;
-	int ret;
-
-	vgpu->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
 
-	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-	ret = vfio_register_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &events,
-				     &vgpu->iommu_notifier);
-	if (ret != 0) {
-		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
-			ret);
-		goto out;
-	}
-
-	ret = -EEXIST;
 	if (vgpu->attached)
-		goto undo_iommu;
+		return -EEXIST;
 
-	ret = -ESRCH;
 	if (!vgpu->vfio_device.kvm ||
 	    vgpu->vfio_device.kvm->mm != current->mm) {
 		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
-		goto undo_iommu;
+		return -ESRCH;
 	}
 
 	kvm_get_kvm(vgpu->vfio_device.kvm);
 
-	ret = -EEXIST;
 	if (__kvmgt_vgpu_exist(vgpu))
-		goto undo_iommu;
+		return -EEXIST;
 
 	vgpu->attached = true;
 
@@ -831,12 +806,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
 
 	atomic_set(&vgpu->released, 0);
 	return 0;
-
-undo_iommu:
-	vfio_unregister_notifier(vfio_dev, VFIO_IOMMU_NOTIFY,
-				 &vgpu->iommu_notifier);
-out:
-	return ret;
 }
 
 static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
@@ -853,8 +822,6 @@ static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
 static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
 {
 	struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
-	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
-	int ret;
 
 	if (!vgpu->attached)
 		return;
@@ -864,11 +831,6 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
 
 	intel_gvt_release_vgpu(vgpu);
 
-	ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_IOMMU_NOTIFY,
-				       &vgpu->iommu_notifier);
-	drm_WARN(&i915->drm, ret,
-		 "vfio_unregister_notifier for iommu failed: %d\n", ret);
-
 	debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs));
 
 	kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
@@ -1610,6 +1572,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = {
 	.write		= intel_vgpu_write,
 	.mmap		= intel_vgpu_mmap,
 	.ioctl		= intel_vgpu_ioctl,
+	.dma_unmap	= intel_vgpu_dma_unmap,
 };
 
 static int intel_vgpu_probe(struct mdev_device *mdev)
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index bc2176421dc5..0047fd88f938 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -33,30 +33,16 @@ static int vfio_ccw_mdev_reset(struct vfio_ccw_private *private)
 	return 0;
 }
 
-static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
-				  unsigned long action,
-				  void *data)
+static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length)
 {
 	struct vfio_ccw_private *private =
-		container_of(nb, struct vfio_ccw_private, nb);
-
-	/*
-	 * Vendor drivers MUST unpin pages in response to an
-	 * invalidation.
-	 */
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
-		struct vfio_iommu_type1_dma_unmap *unmap = data;
-
-		if (!cp_iova_pinned(&private->cp, unmap->iova))
-			return NOTIFY_OK;
-
-		if (vfio_ccw_mdev_reset(private))
-			return NOTIFY_BAD;
+		container_of(vdev, struct vfio_ccw_private, vdev);
 
-		return NOTIFY_OK;
-	}
+	/* Drivers MUST unpin pages in response to an invalidation. */
+	if (!cp_iova_pinned(&private->cp, iova))
+		return;
 
-	return NOTIFY_DONE;
+	vfio_ccw_mdev_reset(private);
 }
 
 static ssize_t name_show(struct mdev_type *mtype,
@@ -154,23 +140,15 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev)
 {
 	struct vfio_ccw_private *private =
 		container_of(vdev, struct vfio_ccw_private, vdev);
-	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 	int ret;
 
 	/* Device cannot simply be opened again from this state */
 	if (private->state == VFIO_CCW_STATE_NOT_OPER)
 		return -EINVAL;
 
-	private->nb.notifier_call = vfio_ccw_mdev_notifier;
-
-	ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY,
-				     &events, &private->nb);
-	if (ret)
-		return ret;
-
 	ret = vfio_ccw_register_async_dev_regions(private);
 	if (ret)
-		goto out_unregister;
+		return ret;
 
 	ret = vfio_ccw_register_schib_dev_regions(private);
 	if (ret)
@@ -190,7 +168,6 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev)
 
 out_unregister:
 	vfio_ccw_unregister_dev_regions(private);
-	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb);
 	return ret;
 }
 
@@ -201,7 +178,6 @@ static void vfio_ccw_mdev_close_device(struct vfio_device *vdev)
 
 	vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_CLOSE);
 	vfio_ccw_unregister_dev_regions(private);
-	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb);
 }
 
 static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private,
@@ -624,6 +600,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = {
 	.write = vfio_ccw_mdev_write,
 	.ioctl = vfio_ccw_mdev_ioctl,
 	.request = vfio_ccw_mdev_request,
+	.dma_unmap = vfio_ccw_dma_unmap,
 };
 
 struct mdev_driver vfio_ccw_mdev_driver = {
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index abac532bf03e..cd24b7fada91 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -73,7 +73,6 @@ struct vfio_ccw_crw {
  * @state: internal state of the device
  * @completion: synchronization helper of the I/O completion
  * @avail: available for creating a mediated device
- * @nb: notifier for vfio events
  * @io_region: MMIO region to input/output I/O arguments/results
  * @io_mutex: protect against concurrent update of I/O regions
  * @region: additional regions for other subchannel operations
@@ -96,7 +95,6 @@ struct vfio_ccw_private {
 	int			state;
 	struct completion	*completion;
 	atomic_t		avail;
-	struct notifier_block	nb;
 	struct ccw_io_region	*io_region;
 	struct mutex		io_mutex;
 	struct vfio_ccw_region *region;
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index a7d2a95796d3..bb1a1677c5c2 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1226,34 +1226,14 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
 	return 0;
 }
 
-/**
- * vfio_ap_mdev_iommu_notifier - IOMMU notifier callback
- *
- * @nb: The notifier block
- * @action: Action to be taken
- * @data: data associated with the request
- *
- * For an UNMAP request, unpin the guest IOVA (the NIB guest address we
- * pinned before). Other requests are ignored.
- *
- * Return: for an UNMAP request, NOFITY_OK; otherwise NOTIFY_DONE.
- */
-static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb,
-				       unsigned long action, void *data)
+static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova,
+				   u64 length)
 {
-	struct ap_matrix_mdev *matrix_mdev;
-
-	matrix_mdev = container_of(nb, struct ap_matrix_mdev, iommu_notifier);
-
-	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
-		struct vfio_iommu_type1_dma_unmap *unmap = data;
-		unsigned long g_pfn = unmap->iova >> PAGE_SHIFT;
-
-		vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
-		return NOTIFY_OK;
-	}
+	struct ap_matrix_mdev *matrix_mdev =
+		container_of(vdev, struct ap_matrix_mdev, vdev);
+	unsigned long g_pfn = iova >> PAGE_SHIFT;
 
-	return NOTIFY_DONE;
+	vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
 }
 
 /**
@@ -1380,27 +1360,11 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev)
 {
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
-	unsigned long events;
-	int ret;
 
 	if (!vdev->kvm)
 		return -EINVAL;
 
-	ret = vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
-	if (ret)
-		return ret;
-
-	matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier;
-	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-	ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events,
-				     &matrix_mdev->iommu_notifier);
-	if (ret)
-		goto err_kvm;
-	return 0;
-
-err_kvm:
-	vfio_ap_mdev_unset_kvm(matrix_mdev);
-	return ret;
+	return vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
 }
 
 static void vfio_ap_mdev_close_device(struct vfio_device *vdev)
@@ -1408,8 +1372,6 @@ static void vfio_ap_mdev_close_device(struct vfio_device *vdev)
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
 
-	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY,
-				 &matrix_mdev->iommu_notifier);
 	vfio_ap_mdev_unset_kvm(matrix_mdev);
 }
 
@@ -1461,6 +1423,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = {
 	.open_device = vfio_ap_mdev_open_device,
 	.close_device = vfio_ap_mdev_close_device,
 	.ioctl = vfio_ap_mdev_ioctl,
+	.dma_unmap = vfio_ap_mdev_dma_unmap,
 };
 
 static struct mdev_driver vfio_ap_matrix_driver = {
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index a26efd804d0d..abb59d59f81b 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -81,8 +81,6 @@ struct ap_matrix {
  * @node:	allows the ap_matrix_mdev struct to be added to a list
  * @matrix:	the adapters, usage domains and control domains assigned to the
  *		mediated matrix device.
- * @iommu_notifier: notifier block used for specifying callback function for
- *		    handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP even
  * @kvm:	the struct holding guest's state
  * @pqap_hook:	the function pointer to the interception handler for the
  *		PQAP(AQIC) instruction.
@@ -92,7 +90,6 @@ struct ap_matrix_mdev {
 	struct vfio_device vdev;
 	struct list_head node;
 	struct ap_matrix matrix;
-	struct notifier_block iommu_notifier;
 	struct kvm *kvm;
 	crypto_hook pqap_hook;
 	struct mdev_device *mdev;
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index bd84ca7c5e35..83c375fa2421 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -231,6 +231,9 @@ int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 {
 	struct vfio_iommu_driver *driver, *tmp;
 
+	if (WARN_ON(!ops->register_notifier != !ops->unregister_notifier))
+		return -EINVAL;
+
 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 	if (!driver)
 		return -ENOMEM;
@@ -1079,8 +1082,20 @@ static void vfio_device_unassign_container(struct vfio_device *device)
 	up_write(&device->group->group_rwsem);
 }
 
+static int vfio_iommu_notifier(struct notifier_block *nb, unsigned long action,
+			       void *data)
+{
+	struct vfio_device *vfio_device =
+		container_of(nb, struct vfio_device, iommu_nb);
+	struct vfio_iommu_type1_dma_unmap *unmap = data;
+
+	vfio_device->ops->dma_unmap(vfio_device, unmap->iova, unmap->size);
+	return NOTIFY_OK;
+}
+
 static struct file *vfio_device_open(struct vfio_device *device)
 {
+	struct vfio_iommu_driver *iommu_driver;
 	struct file *filep;
 	int ret;
 
@@ -1111,6 +1126,18 @@ static struct file *vfio_device_open(struct vfio_device *device)
 			if (ret)
 				goto err_undo_count;
 		}
+
+		iommu_driver = device->group->container->iommu_driver;
+		if (device->ops->dma_unmap && iommu_driver &&
+		    iommu_driver->ops->register_notifier) {
+			unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+
+			device->iommu_nb.notifier_call = vfio_iommu_notifier;
+			iommu_driver->ops->register_notifier(
+				device->group->container->iommu_data, &events,
+				&device->iommu_nb);
+		}
+
 		up_read(&device->group->group_rwsem);
 	}
 	mutex_unlock(&device->dev_set->lock);
@@ -1145,8 +1172,16 @@ static struct file *vfio_device_open(struct vfio_device *device)
 err_close_device:
 	mutex_lock(&device->dev_set->lock);
 	down_read(&device->group->group_rwsem);
-	if (device->open_count == 1 && device->ops->close_device)
+	if (device->open_count == 1 && device->ops->close_device) {
 		device->ops->close_device(device);
+
+		iommu_driver = device->group->container->iommu_driver;
+		if (device->ops->dma_unmap && iommu_driver &&
+		    iommu_driver->ops->unregister_notifier)
+			iommu_driver->ops->unregister_notifier(
+				device->group->container->iommu_data,
+				&device->iommu_nb);
+	}
 err_undo_count:
 	up_read(&device->group->group_rwsem);
 	device->open_count--;
@@ -1341,12 +1376,20 @@ static const struct file_operations vfio_group_fops = {
 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_device *device = filep->private_data;
+	struct vfio_iommu_driver *iommu_driver;
 
 	mutex_lock(&device->dev_set->lock);
 	vfio_assert_device_open(device);
 	down_read(&device->group->group_rwsem);
 	if (device->open_count == 1 && device->ops->close_device)
 		device->ops->close_device(device);
+
+	iommu_driver = device->group->container->iommu_driver;
+	if (device->ops->dma_unmap && iommu_driver &&
+	    iommu_driver->ops->unregister_notifier)
+		iommu_driver->ops->unregister_notifier(
+			device->group->container->iommu_data,
+			&device->iommu_nb);
 	up_read(&device->group->group_rwsem);
 	device->open_count--;
 	if (device->open_count == 0)
@@ -2029,90 +2072,6 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
 }
 EXPORT_SYMBOL(vfio_dma_rw);
 
-static int vfio_register_iommu_notifier(struct vfio_group *group,
-					unsigned long *events,
-					struct notifier_block *nb)
-{
-	struct vfio_container *container;
-	struct vfio_iommu_driver *driver;
-	int ret;
-
-	lockdep_assert_held_read(&group->group_rwsem);
-
-	container = group->container;
-	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->register_notifier))
-		ret = driver->ops->register_notifier(container->iommu_data,
-						     events, nb);
-	else
-		ret = -ENOTTY;
-
-	return ret;
-}
-
-static int vfio_unregister_iommu_notifier(struct vfio_group *group,
-					  struct notifier_block *nb)
-{
-	struct vfio_container *container;
-	struct vfio_iommu_driver *driver;
-	int ret;
-
-	lockdep_assert_held_read(&group->group_rwsem);
-
-	container = group->container;
-	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->unregister_notifier))
-		ret = driver->ops->unregister_notifier(container->iommu_data,
-						       nb);
-	else
-		ret = -ENOTTY;
-
-	return ret;
-}
-
-int vfio_register_notifier(struct vfio_device *device,
-			   enum vfio_notify_type type, unsigned long *events,
-			   struct notifier_block *nb)
-{
-	struct vfio_group *group = device->group;
-	int ret;
-
-	if (!nb || !events || (*events == 0) ||
-	    !vfio_assert_device_open(device))
-		return -EINVAL;
-
-	switch (type) {
-	case VFIO_IOMMU_NOTIFY:
-		ret = vfio_register_iommu_notifier(group, events, nb);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfio_register_notifier);
-
-int vfio_unregister_notifier(struct vfio_device *device,
-			     enum vfio_notify_type type,
-			     struct notifier_block *nb)
-{
-	struct vfio_group *group = device->group;
-	int ret;
-
-	if (!nb || !vfio_assert_device_open(device))
-		return -EINVAL;
-
-	switch (type) {
-	case VFIO_IOMMU_NOTIFY:
-		ret = vfio_unregister_iommu_notifier(group, nb);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfio_unregister_notifier);
-
 /*
  * Module/class support
  */
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index a67130221151..25da02ca1568 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -33,6 +33,9 @@ enum vfio_iommu_notify_type {
 	VFIO_IOMMU_CONTAINER_CLOSE = 0,
 };
 
+/* events for register_notifier() */
+#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
+
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 4d26e149db81..1f9fc7a9be9e 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -49,6 +49,7 @@ struct vfio_device {
 	unsigned int open_count;
 	struct completion comp;
 	struct list_head group_next;
+	struct notifier_block iommu_nb;
 };
 
 /**
@@ -65,6 +66,8 @@ struct vfio_device {
  * @match: Optional device name match callback (return: 0 for no-match, >0 for
  *         match, -errno for abort (ex. match with insufficient or incorrect
  *         additional args)
+ * @dma_unmap: Called when userspace unmaps IOVA from the container
+ *             this device is attached to.
  * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
  */
 struct vfio_device_ops {
@@ -80,6 +83,7 @@ struct vfio_device_ops {
 	int	(*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
 	void	(*request)(struct vfio_device *vdev, unsigned int count);
 	int	(*match)(struct vfio_device *vdev, char *buf);
+	void	(*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length);
 	int	(*device_feature)(struct vfio_device *device, u32 flags,
 				  void __user *arg, size_t argsz);
 };
@@ -164,23 +168,6 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
 		void *data, size_t len, bool write);
 
-/* each type has independent events */
-enum vfio_notify_type {
-	VFIO_IOMMU_NOTIFY = 0,
-};
-
-/* events for VFIO_IOMMU_NOTIFY */
-#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	BIT(0)
-
-int vfio_register_notifier(struct vfio_device *device,
-			   enum vfio_notify_type type,
-			   unsigned long *required_events,
-			   struct notifier_block *nb);
-int vfio_unregister_notifier(struct vfio_device *device,
-			     enum vfio_notify_type type,
-			     struct notifier_block *nb);
-
-
 /*
  * Sub-module helpers
  */
-- 
cgit v1.2.3


From 8cfc5b60751bcf9b4c6bbab3f6a72d59e0156a89 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 19 Jul 2022 21:02:49 -0300
Subject: vfio: Replace the iommu notifier with a device list

Instead of bouncing the function call to the driver op through a blocking
notifier just have the iommu layer call it directly.

Register each device that is being attached to the iommu with the lower
driver which then threads them on a linked list and calls the appropriate
driver op at the right time.

Currently the only use is if dma_unmap() is defined.

Also, fully lock all the debugging tests on the pinning path that a
dma_unmap is registered.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/2-v4-681e038e30fd+78-vfio_unmap_notif_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c             |  41 ++++------------
 drivers/vfio/vfio.h             |  12 ++---
 drivers/vfio/vfio_iommu_type1.c | 103 +++++++++++++++++++++++++---------------
 include/linux/vfio.h            |   2 +-
 4 files changed, 81 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 83c375fa2421..b3ce8073cfb1 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -231,7 +231,7 @@ int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 {
 	struct vfio_iommu_driver *driver, *tmp;
 
-	if (WARN_ON(!ops->register_notifier != !ops->unregister_notifier))
+	if (WARN_ON(!ops->register_device != !ops->unregister_device))
 		return -EINVAL;
 
 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
@@ -1082,17 +1082,6 @@ static void vfio_device_unassign_container(struct vfio_device *device)
 	up_write(&device->group->group_rwsem);
 }
 
-static int vfio_iommu_notifier(struct notifier_block *nb, unsigned long action,
-			       void *data)
-{
-	struct vfio_device *vfio_device =
-		container_of(nb, struct vfio_device, iommu_nb);
-	struct vfio_iommu_type1_dma_unmap *unmap = data;
-
-	vfio_device->ops->dma_unmap(vfio_device, unmap->iova, unmap->size);
-	return NOTIFY_OK;
-}
-
 static struct file *vfio_device_open(struct vfio_device *device)
 {
 	struct vfio_iommu_driver *iommu_driver;
@@ -1128,15 +1117,9 @@ static struct file *vfio_device_open(struct vfio_device *device)
 		}
 
 		iommu_driver = device->group->container->iommu_driver;
-		if (device->ops->dma_unmap && iommu_driver &&
-		    iommu_driver->ops->register_notifier) {
-			unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-
-			device->iommu_nb.notifier_call = vfio_iommu_notifier;
-			iommu_driver->ops->register_notifier(
-				device->group->container->iommu_data, &events,
-				&device->iommu_nb);
-		}
+		if (iommu_driver && iommu_driver->ops->register_device)
+			iommu_driver->ops->register_device(
+				device->group->container->iommu_data, device);
 
 		up_read(&device->group->group_rwsem);
 	}
@@ -1176,11 +1159,9 @@ err_close_device:
 		device->ops->close_device(device);
 
 		iommu_driver = device->group->container->iommu_driver;
-		if (device->ops->dma_unmap && iommu_driver &&
-		    iommu_driver->ops->unregister_notifier)
-			iommu_driver->ops->unregister_notifier(
-				device->group->container->iommu_data,
-				&device->iommu_nb);
+		if (iommu_driver && iommu_driver->ops->unregister_device)
+			iommu_driver->ops->unregister_device(
+				device->group->container->iommu_data, device);
 	}
 err_undo_count:
 	up_read(&device->group->group_rwsem);
@@ -1385,11 +1366,9 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 		device->ops->close_device(device);
 
 	iommu_driver = device->group->container->iommu_driver;
-	if (device->ops->dma_unmap && iommu_driver &&
-	    iommu_driver->ops->unregister_notifier)
-		iommu_driver->ops->unregister_notifier(
-			device->group->container->iommu_data,
-			&device->iommu_nb);
+	if (iommu_driver && iommu_driver->ops->unregister_device)
+		iommu_driver->ops->unregister_device(
+			device->group->container->iommu_data, device);
 	up_read(&device->group->group_rwsem);
 	device->open_count--;
 	if (device->open_count == 0)
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 25da02ca1568..4a7db1f3c33e 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -33,9 +33,6 @@ enum vfio_iommu_notify_type {
 	VFIO_IOMMU_CONTAINER_CLOSE = 0,
 };
 
-/* events for register_notifier() */
-#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
-
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
@@ -58,11 +55,10 @@ struct vfio_iommu_driver_ops {
 				     unsigned long *phys_pfn);
 	int		(*unpin_pages)(void *iommu_data,
 				       unsigned long *user_pfn, int npage);
-	int		(*register_notifier)(void *iommu_data,
-					     unsigned long *events,
-					     struct notifier_block *nb);
-	int		(*unregister_notifier)(void *iommu_data,
-					       struct notifier_block *nb);
+	void		(*register_device)(void *iommu_data,
+					   struct vfio_device *vdev);
+	void		(*unregister_device)(void *iommu_data,
+					     struct vfio_device *vdev);
 	int		(*dma_rw)(void *iommu_data, dma_addr_t user_iova,
 				  void *data, size_t count, bool write);
 	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index db24062fb343..026a1d2553a2 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -67,7 +67,8 @@ struct vfio_iommu {
 	struct list_head	iova_list;
 	struct mutex		lock;
 	struct rb_root		dma_list;
-	struct blocking_notifier_head notifier;
+	struct list_head	device_list;
+	struct mutex		device_list_lock;
 	unsigned int		dma_avail;
 	unsigned int		vaddr_invalid_count;
 	uint64_t		pgsize_bitmap;
@@ -865,8 +866,8 @@ again:
 		}
 	}
 
-	/* Fail if notifier list is empty */
-	if (!iommu->notifier.head) {
+	/* Fail if no dma_umap notifier is registered */
+	if (list_empty(&iommu->device_list)) {
 		ret = -EINVAL;
 		goto pin_done;
 	}
@@ -1287,6 +1288,35 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
 	return 0;
 }
 
+/*
+ * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
+ * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
+ * pages in response to an invalidation.
+ */
+static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
+				  struct vfio_dma *dma)
+{
+	struct vfio_device *device;
+
+	if (list_empty(&iommu->device_list))
+		return;
+
+	/*
+	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
+	 * pinned within the range. Since vfio_unpin_pages() will eventually
+	 * call back down to this code and try to obtain the iommu->lock we must
+	 * drop it.
+	 */
+	mutex_lock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
+
+	list_for_each_entry(device, &iommu->device_list, iommu_entry)
+		device->ops->dma_unmap(device, dma->iova, dma->size);
+
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_lock(&iommu->lock);
+}
+
 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 			     struct vfio_iommu_type1_dma_unmap *unmap,
 			     struct vfio_bitmap *bitmap)
@@ -1400,8 +1430,6 @@ again:
 		}
 
 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
-			struct vfio_iommu_type1_dma_unmap nb_unmap;
-
 			if (dma_last == dma) {
 				BUG_ON(++retries > 10);
 			} else {
@@ -1409,20 +1437,7 @@ again:
 				retries = 0;
 			}
 
-			nb_unmap.iova = dma->iova;
-			nb_unmap.size = dma->size;
-
-			/*
-			 * Notify anyone (mdev vendor drivers) to invalidate and
-			 * unmap iovas within the range we're about to unmap.
-			 * Vendor drivers MUST unpin pages in response to an
-			 * invalidation.
-			 */
-			mutex_unlock(&iommu->lock);
-			blocking_notifier_call_chain(&iommu->notifier,
-						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
-						    &nb_unmap);
-			mutex_lock(&iommu->lock);
+			vfio_notify_dma_unmap(iommu, dma);
 			goto again;
 		}
 
@@ -2475,7 +2490,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 
 		if (list_empty(&iommu->emulated_iommu_groups) &&
 		    list_empty(&iommu->domain_list)) {
-			WARN_ON(iommu->notifier.head);
+			WARN_ON(!list_empty(&iommu->device_list));
 			vfio_iommu_unmap_unpin_all(iommu);
 		}
 		goto detach_group_done;
@@ -2507,7 +2522,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 		if (list_empty(&domain->group_list)) {
 			if (list_is_singular(&iommu->domain_list)) {
 				if (list_empty(&iommu->emulated_iommu_groups)) {
-					WARN_ON(iommu->notifier.head);
+					WARN_ON(!list_empty(
+						&iommu->device_list));
 					vfio_iommu_unmap_unpin_all(iommu);
 				} else {
 					vfio_iommu_unmap_unpin_reaccount(iommu);
@@ -2568,7 +2584,8 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	iommu->dma_avail = dma_entry_limit;
 	iommu->container_open = true;
 	mutex_init(&iommu->lock);
-	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
+	mutex_init(&iommu->device_list_lock);
+	INIT_LIST_HEAD(&iommu->device_list);
 	init_waitqueue_head(&iommu->vaddr_wait);
 	iommu->pgsize_bitmap = PAGE_MASK;
 	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
@@ -3005,28 +3022,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 	}
 }
 
-static int vfio_iommu_type1_register_notifier(void *iommu_data,
-					      unsigned long *events,
-					      struct notifier_block *nb)
+static void vfio_iommu_type1_register_device(void *iommu_data,
+					     struct vfio_device *vdev)
 {
 	struct vfio_iommu *iommu = iommu_data;
 
-	/* clear known events */
-	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-
-	/* refuse to register if still events remaining */
-	if (*events)
-		return -EINVAL;
+	if (!vdev->ops->dma_unmap)
+		return;
 
-	return blocking_notifier_chain_register(&iommu->notifier, nb);
+	/*
+	 * list_empty(&iommu->device_list) is tested under the iommu->lock while
+	 * iteration for dma_unmap must be done under the device_list_lock.
+	 * Holding both locks here allows avoiding the device_list_lock in
+	 * several fast paths. See vfio_notify_dma_unmap()
+	 */
+	mutex_lock(&iommu->lock);
+	mutex_lock(&iommu->device_list_lock);
+	list_add(&vdev->iommu_entry, &iommu->device_list);
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
 }
 
-static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
-						struct notifier_block *nb)
+static void vfio_iommu_type1_unregister_device(void *iommu_data,
+					       struct vfio_device *vdev)
 {
 	struct vfio_iommu *iommu = iommu_data;
 
-	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
+	if (!vdev->ops->dma_unmap)
+		return;
+
+	mutex_lock(&iommu->lock);
+	mutex_lock(&iommu->device_list_lock);
+	list_del(&vdev->iommu_entry);
+	mutex_unlock(&iommu->device_list_lock);
+	mutex_unlock(&iommu->lock);
 }
 
 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
@@ -3160,8 +3189,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.detach_group		= vfio_iommu_type1_detach_group,
 	.pin_pages		= vfio_iommu_type1_pin_pages,
 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
-	.register_notifier	= vfio_iommu_type1_register_notifier,
-	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
+	.register_device	= vfio_iommu_type1_register_device,
+	.unregister_device	= vfio_iommu_type1_unregister_device,
 	.dma_rw			= vfio_iommu_type1_dma_rw,
 	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
 	.notify			= vfio_iommu_type1_notify,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 1f9fc7a9be9e..19cefbaa3d06 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -49,7 +49,7 @@ struct vfio_device {
 	unsigned int open_count;
 	struct completion comp;
 	struct list_head group_next;
-	struct notifier_block iommu_nb;
+	struct list_head iommu_entry;
 };
 
 /**
-- 
cgit v1.2.3


From e0c7ea83f006ce8c3264ef8b6508a891d886ad4f Mon Sep 17 00:00:00 2001
From: Shengjiu Wang <shengjiu.wang@nxp.com>
Date: Thu, 7 Jul 2022 11:00:29 +0800
Subject: dmaengine: imx-sdma: Add FIFO stride support for multi FIFO script

The peripheral may have several FIFOs, but some case just select
some FIFOs from them for data transfer, which means FIFO0 and FIFO2
may be selected. So add FIFO address stride support, 0 means all FIFOs
are continuous, 1 means 1 word stride between FIFOs. All stride between
FIFOs should be same.

Another option words_per_fifo means how many audio channel data copied
to one FIFO one time, 1 means one channel per FIFO, 2 means 2 channels
per FIFO.

If 'n_fifos_src =  4' and 'words_per_fifo = 2', it means the first two
words(channels) fetch from FIFO0 and then jump to FIFO1 for next two words,
and so on after the last FIFO3 fetched, roll back to FIFO0.

Signed-off-by: Joy Zou <joy.zou@nxp.com>
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Link: https://lore.kernel.org/r/1657162829-9273-1-git-send-email-shengjiu.wang@nxp.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c      | 27 +++++++++++++++++++++++++--
 include/linux/dma/imx-dma.h | 13 +++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 8a5d4d4a4dff..e302089784ed 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -183,6 +183,8 @@
 				 BIT(DMA_DEV_TO_DEV))
 
 #define SDMA_WATERMARK_LEVEL_N_FIFOS	GENMASK(15, 12)
+#define SDMA_WATERMARK_LEVEL_OFF_FIFOS  GENMASK(19, 16)
+#define SDMA_WATERMARK_LEVEL_WORDS_PER_FIFO   GENMASK(31, 28)
 #define SDMA_WATERMARK_LEVEL_SW_DONE	BIT(23)
 
 #define SDMA_DONE0_CONFIG_DONE_SEL	BIT(7)
@@ -429,6 +431,9 @@ struct sdma_desc {
  * @n_fifos_src:	number of source device fifos
  * @n_fifos_dst:	number of destination device fifos
  * @sw_done:		software done flag
+ * @stride_fifos_src:	stride for source device FIFOs
+ * @stride_fifos_dst:	stride for destination device FIFOs
+ * @words_per_fifo:	copy number of words one time for one FIFO
  */
 struct sdma_channel {
 	struct virt_dma_chan		vc;
@@ -456,6 +461,9 @@ struct sdma_channel {
 	bool				is_ram_script;
 	unsigned int			n_fifos_src;
 	unsigned int			n_fifos_dst;
+	unsigned int			stride_fifos_src;
+	unsigned int			stride_fifos_dst;
+	unsigned int			words_per_fifo;
 	bool				sw_done;
 };
 
@@ -1245,17 +1253,29 @@ static void sdma_set_watermarklevel_for_p2p(struct sdma_channel *sdmac)
 static void sdma_set_watermarklevel_for_sais(struct sdma_channel *sdmac)
 {
 	unsigned int n_fifos;
+	unsigned int stride_fifos;
+	unsigned int words_per_fifo;
 
 	if (sdmac->sw_done)
 		sdmac->watermark_level |= SDMA_WATERMARK_LEVEL_SW_DONE;
 
-	if (sdmac->direction == DMA_DEV_TO_MEM)
+	if (sdmac->direction == DMA_DEV_TO_MEM) {
 		n_fifos = sdmac->n_fifos_src;
-	else
+		stride_fifos = sdmac->stride_fifos_src;
+	} else {
 		n_fifos = sdmac->n_fifos_dst;
+		stride_fifos = sdmac->stride_fifos_dst;
+	}
+
+	words_per_fifo = sdmac->words_per_fifo;
 
 	sdmac->watermark_level |=
 			FIELD_PREP(SDMA_WATERMARK_LEVEL_N_FIFOS, n_fifos);
+	sdmac->watermark_level |=
+			FIELD_PREP(SDMA_WATERMARK_LEVEL_OFF_FIFOS, stride_fifos);
+	if (words_per_fifo)
+		sdmac->watermark_level |=
+			FIELD_PREP(SDMA_WATERMARK_LEVEL_WORDS_PER_FIFO, (words_per_fifo - 1));
 }
 
 static int sdma_config_channel(struct dma_chan *chan)
@@ -1769,6 +1789,9 @@ static int sdma_config(struct dma_chan *chan,
 		}
 		sdmac->n_fifos_src = sdmacfg->n_fifos_src;
 		sdmac->n_fifos_dst = sdmacfg->n_fifos_dst;
+		sdmac->stride_fifos_src = sdmacfg->stride_fifos_src;
+		sdmac->stride_fifos_dst = sdmacfg->stride_fifos_dst;
+		sdmac->words_per_fifo = sdmacfg->words_per_fifo;
 		sdmac->sw_done = sdmacfg->sw_done;
 	}
 
diff --git a/include/linux/dma/imx-dma.h b/include/linux/dma/imx-dma.h
index 8887762360d4..f487a4fa103a 100644
--- a/include/linux/dma/imx-dma.h
+++ b/include/linux/dma/imx-dma.h
@@ -70,6 +70,16 @@ static inline int imx_dma_is_general_purpose(struct dma_chan *chan)
  * struct sdma_peripheral_config - SDMA config for audio
  * @n_fifos_src: Number of FIFOs for recording
  * @n_fifos_dst: Number of FIFOs for playback
+ * @stride_fifos_src: FIFO address stride for recording, 0 means all FIFOs are
+ *                    continuous, 1 means 1 word stride between FIFOs. All stride
+ *                    between FIFOs should be same.
+ * @stride_fifos_dst: FIFO address stride for playback
+ * @words_per_fifo: numbers of words per FIFO fetch/fill, 1 means
+ *                  one channel per FIFO, 2 means 2 channels per FIFO..
+ *                  If 'n_fifos_src =  4' and 'words_per_fifo = 2', it
+ *                  means the first two words(channels) fetch from FIFO0
+ *                  and then jump to FIFO1 for next two words, and so on
+ *                  after the last FIFO3 fetched, roll back to FIFO0.
  * @sw_done: Use software done. Needed for PDM (micfil)
  *
  * Some i.MX Audio devices (SAI, micfil) have multiple successive FIFO
@@ -82,6 +92,9 @@ static inline int imx_dma_is_general_purpose(struct dma_chan *chan)
 struct sdma_peripheral_config {
 	int n_fifos_src;
 	int n_fifos_dst;
+	int stride_fifos_src;
+	int stride_fifos_dst;
+	int words_per_fifo;
 	bool sw_done;
 };
 
-- 
cgit v1.2.3


From 974854ab0728532600c72e41a44d6ce1cf8f20a4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 12 Jul 2022 18:37:54 -0700
Subject: cxl/acpi: Track CXL resources in iomem_resource

Recall that CXL capable address ranges, on ACPI platforms, are published
in the CEDT.CFMWS (CXL Early Discovery Table: CXL Fixed Memory Window
Structures). These windows represent both the actively mapped capacity
and the potential address space that can be dynamically assigned to a
new CXL decode configuration (region / interleave-set).

CXL endpoints like DDR DIMMs can be mapped at any physical address
including 0 and legacy ranges.

There is an expectation and requirement that the /proc/iomem interface
and the iomem_resource tree in the kernel reflect the full set of
platform address ranges. I.e. that every address range that platform
firmware and bus drivers enumerate be reflected as an iomem_resource
entry. The hard requirement to do this for CXL arises from the fact that
facilities like CONFIG_DEVICE_PRIVATE expect to be able to treat empty
iomem_resource ranges as free for software to use as proxy address
space. Without CXL publishing its potential address ranges in
iomem_resource, the CONFIG_DEVICE_PRIVATE mechanism may inadvertently
steal capacity reserved for runtime provisioning of new CXL regions.

So, iomem_resource needs to know about both active and potential CXL
resource ranges. The active CXL resources might already be reflected in
iomem_resource as "System RAM". insert_resource_expand_to_fit() handles
re-parenting "System RAM" underneath a CXL window.

The "_expand_to_fit()" behavior handles cases where a CXL window is not
a strict superset of an existing entry in the iomem_resource tree. The
"_expand_to_fit()" behavior is acceptable from the perspective of
resource allocation. The expansion happens because a conflicting
resource range is already populated, which means the resource boundary
expansion does not result in any additional free CXL address space being
made available. CXL address space allocation is always bounded by the
orginal unexpanded address range.

However, the potential for expansion does mean that something like
walk_iomem_res_desc(IORES_DESC_CXL...) can only return fuzzy answers on
corner case platforms that cause the resource tree to expand a CXL
window resource over a range that is not decoded by CXL. This would be
an odd platform configuration, but if it becomes a problem in practice
the CXL subsytem could just publish an API that returns definitive
answers.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/165784325943.1758207.5310344844375305118.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/acpi.c     | 144 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/ioport.h |   1 +
 kernel/resource.c      |   7 +++
 3 files changed, 149 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 62bf22ffb7aa..e2b6cbd04846 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -73,6 +73,8 @@ static int cxl_acpi_cfmws_verify(struct device *dev,
 struct cxl_cfmws_context {
 	struct device *dev;
 	struct cxl_port *root_port;
+	struct resource *cxl_res;
+	int id;
 };
 
 static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
@@ -81,11 +83,13 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	int target_map[CXL_DECODER_MAX_INTERLEAVE];
 	struct cxl_cfmws_context *ctx = arg;
 	struct cxl_port *root_port = ctx->root_port;
+	struct resource *cxl_res = ctx->cxl_res;
 	struct cxl_switch_decoder *cxlsd;
 	struct device *dev = ctx->dev;
 	struct acpi_cedt_cfmws *cfmws;
 	struct cxl_decoder *cxld;
 	unsigned int ways, i, ig;
+	struct resource *res;
 	int rc;
 
 	cfmws = (struct acpi_cedt_cfmws *) header;
@@ -107,6 +111,23 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	for (i = 0; i < ways; i++)
 		target_map[i] = cfmws->interleave_targets[i];
 
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	res->name = kasprintf(GFP_KERNEL, "CXL Window %d", ctx->id++);
+	if (!res->name)
+		goto err_name;
+
+	res->start = cfmws->base_hpa;
+	res->end = cfmws->base_hpa + cfmws->window_size - 1;
+	res->flags = IORESOURCE_MEM;
+
+	/* add to the local resource tracking to establish a sort order */
+	rc = insert_resource(cxl_res, res);
+	if (rc)
+		goto err_insert;
+
 	cxlsd = cxl_root_decoder_alloc(root_port, ways);
 	if (IS_ERR(cxld))
 		return 0;
@@ -115,8 +136,8 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions);
 	cxld->target_type = CXL_DECODER_EXPANDER;
 	cxld->hpa_range = (struct range) {
-		.start = cfmws->base_hpa,
-		.end = cfmws->base_hpa + cfmws->window_size - 1,
+		.start = res->start,
+		.end = res->end,
 	};
 	cxld->interleave_ways = ways;
 	cxld->interleave_granularity = ig;
@@ -137,6 +158,12 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 		cxld->hpa_range.start, cxld->hpa_range.end);
 
 	return 0;
+
+err_insert:
+	kfree(res->name);
+err_name:
+	kfree(res);
+	return -ENOMEM;
 }
 
 __mock struct acpi_device *to_cxl_host_bridge(struct device *host,
@@ -291,9 +318,101 @@ static void cxl_acpi_lock_reset_class(void *dev)
 	device_lock_reset_class(dev);
 }
 
+static void del_cxl_resource(struct resource *res)
+{
+	kfree(res->name);
+	kfree(res);
+}
+
+static void cxl_set_public_resource(struct resource *priv, struct resource *pub)
+{
+	priv->desc = (unsigned long) pub;
+}
+
+static struct resource *cxl_get_public_resource(struct resource *priv)
+{
+	return (struct resource *) priv->desc;
+}
+
+static void remove_cxl_resources(void *data)
+{
+	struct resource *res, *next, *cxl = data;
+
+	for (res = cxl->child; res; res = next) {
+		struct resource *victim = cxl_get_public_resource(res);
+
+		next = res->sibling;
+		remove_resource(res);
+
+		if (victim) {
+			remove_resource(victim);
+			kfree(victim);
+		}
+
+		del_cxl_resource(res);
+	}
+}
+
+/**
+ * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource
+ * @cxl_res: A standalone resource tree where each CXL window is a sibling
+ *
+ * Walk each CXL window in @cxl_res and add it to iomem_resource potentially
+ * expanding its boundaries to ensure that any conflicting resources become
+ * children. If a window is expanded it may then conflict with a another window
+ * entry and require the window to be truncated or trimmed. Consider this
+ * situation:
+ *
+ * |-- "CXL Window 0" --||----- "CXL Window 1" -----|
+ * |--------------- "System RAM" -------------|
+ *
+ * ...where platform firmware has established as System RAM resource across 2
+ * windows, but has left some portion of window 1 for dynamic CXL region
+ * provisioning. In this case "Window 0" will span the entirety of the "System
+ * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end
+ * of that "System RAM" resource.
+ */
+static int add_cxl_resources(struct resource *cxl_res)
+{
+	struct resource *res, *new, *next;
+
+	for (res = cxl_res->child; res; res = next) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->name = res->name;
+		new->start = res->start;
+		new->end = res->end;
+		new->flags = IORESOURCE_MEM;
+		new->desc = IORES_DESC_CXL;
+
+		/*
+		 * Record the public resource in the private cxl_res tree for
+		 * later removal.
+		 */
+		cxl_set_public_resource(res, new);
+
+		insert_resource_expand_to_fit(&iomem_resource, new);
+
+		next = res->sibling;
+		while (next && resource_overlaps(new, next)) {
+			if (resource_contains(new, next)) {
+				struct resource *_next = next->sibling;
+
+				remove_resource(next);
+				del_cxl_resource(next);
+				next = _next;
+			} else
+				next->start = new->end + 1;
+		}
+	}
+	return 0;
+}
+
 static int cxl_acpi_probe(struct platform_device *pdev)
 {
 	int rc;
+	struct resource *cxl_res;
 	struct cxl_port *root_port;
 	struct device *host = &pdev->dev;
 	struct acpi_device *adev = ACPI_COMPANION(host);
@@ -305,6 +424,14 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	if (rc)
 		return rc;
 
+	cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL);
+	if (!cxl_res)
+		return -ENOMEM;
+	cxl_res->name = "CXL mem";
+	cxl_res->start = 0;
+	cxl_res->end = -1;
+	cxl_res->flags = IORESOURCE_MEM;
+
 	root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL);
 	if (IS_ERR(root_port))
 		return PTR_ERR(root_port);
@@ -315,11 +442,22 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	if (rc < 0)
 		return rc;
 
+	rc = devm_add_action_or_reset(host, remove_cxl_resources, cxl_res);
+	if (rc)
+		return rc;
+
 	ctx = (struct cxl_cfmws_context) {
 		.dev = host,
 		.root_port = root_port,
+		.cxl_res = cxl_res,
 	};
-	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	if (rc < 0)
+		return -ENXIO;
+
+	rc = add_cxl_resources(cxl_res);
+	if (rc)
+		return rc;
 
 	/*
 	 * Root level scanned with host-bridge as dports, now scan host-bridges
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index ec5f71f7135b..79d1ad6d6275 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -141,6 +141,7 @@ enum {
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_RESERVED			= 7,
 	IORES_DESC_SOFT_RESERVED		= 8,
+	IORES_DESC_CXL				= 9,
 };
 
 /*
diff --git a/kernel/resource.c b/kernel/resource.c
index 34eaee179689..53a534db350e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -891,6 +891,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
 	}
 	write_unlock(&resource_lock);
 }
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
 
 /**
  * remove_resource - Remove a resource in the resource tree
-- 
cgit v1.2.3


From 14b80582c43e4f550acfd93c2b2cadbe36ea0874 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 20 May 2022 13:41:24 -0700
Subject: resource: Introduce alloc_free_mem_region()

The core of devm_request_free_mem_region() is a helper that searches for
free space in iomem_resource and performs __request_region_locked() on
the result of that search. The policy choices of the implementation
conform to what CONFIG_DEVICE_PRIVATE users want which is memory that is
immediately marked busy, and a preference to search for the first-fit
free range in descending order from the top of the physical address
space.

CXL has a need for a similar allocator, but with the following tweaks:

1/ Search for free space in ascending order

2/ Search for free space relative to a given CXL window

3/ 'insert' rather than 'request' the new resource given downstream
   drivers from the CXL Region driver (like the pmem or dax drivers) are
   responsible for request_mem_region() when they activate the memory
   range.

Rework __request_free_mem_region() into get_free_mem_region() which
takes a set of GFR_* (Get Free Region) flags to control the allocation
policy (ascending vs descending), and "busy" policy (insert_resource()
vs request_region()).

As part of the consolidation of the legacy GFR_REQUEST_REGION case with
the new default of just inserting a new resource into the free space
some minor cleanups like not checking for NULL before calling
devres_free() (which does its own check) is included.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/linux-cxl/20220420143406.GY2120790@nvidia.com/
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/165784333333.1758207.13703329337805274043.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/ioport.h |   2 +
 kernel/resource.c      | 178 +++++++++++++++++++++++++++++++++++++++----------
 mm/Kconfig             |   5 ++
 3 files changed, 150 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 79d1ad6d6275..616b683563a9 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -330,6 +330,8 @@ struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size);
 struct resource *request_free_mem_region(struct resource *base,
 		unsigned long size, const char *name);
+struct resource *alloc_free_mem_region(struct resource *base,
+		unsigned long size, unsigned long align, const char *name);
 
 static inline void irqresource_disabled(struct resource *res, u32 irq)
 {
diff --git a/kernel/resource.c b/kernel/resource.c
index 53a534db350e..4c5e80b92f2f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
-static int __region_intersects(resource_size_t start, size_t size,
-			unsigned long flags, unsigned long desc)
+static int __region_intersects(struct resource *parent, resource_size_t start,
+			       size_t size, unsigned long flags,
+			       unsigned long desc)
 {
 	struct resource res;
 	int type = 0; int other = 0;
@@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size,
 	res.start = start;
 	res.end = start + size - 1;
 
-	for (p = iomem_resource.child; p ; p = p->sibling) {
+	for (p = parent->child; p ; p = p->sibling) {
 		bool is_type = (((p->flags & flags) == flags) &&
 				((desc == IORES_DESC_NONE) ||
 				 (desc == p->desc)));
@@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
 	int ret;
 
 	read_lock(&resource_lock);
-	ret = __region_intersects(start, size, flags, desc);
+	ret = __region_intersects(&iomem_resource, start, size, flags, desc);
 	read_unlock(&resource_lock);
 
 	return ret;
@@ -1780,62 +1781,139 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
 
-#ifdef CONFIG_DEVICE_PRIVATE
-static struct resource *__request_free_mem_region(struct device *dev,
-		struct resource *base, unsigned long size, const char *name)
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING		(1UL << 0)
+#define GFR_REQUEST_REGION	(1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+				 resource_size_t align, unsigned long flags)
+{
+	if (flags & GFR_DESCENDING) {
+		resource_size_t end;
+
+		end = min_t(resource_size_t, base->end,
+			    (1ULL << MAX_PHYSMEM_BITS) - 1);
+		return end - size + 1;
+	}
+
+	return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+			 resource_size_t size, unsigned long flags)
+{
+	if (flags & GFR_DESCENDING)
+		return addr > size && addr >= base->start;
+	/*
+	 * In the ascend case be careful that the last increment by
+	 * @size did not wrap 0.
+	 */
+	return addr > addr - size &&
+	       addr <= min_t(resource_size_t, base->end,
+			     (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+				unsigned long flags)
+{
+	if (flags & GFR_DESCENDING)
+		return addr - size;
+	return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+	struct resource *res = _res;
+
+	if (res->parent)
+		remove_resource(res);
+	free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+		    resource_size_t size, const unsigned long align,
+		    const char *name, const unsigned long desc,
+		    const unsigned long flags)
 {
-	resource_size_t end, addr;
+	resource_size_t addr;
 	struct resource *res;
 	struct region_devres *dr = NULL;
 
-	size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
-	end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
-	addr = end - size + 1UL;
+	size = ALIGN(size, align);
 
 	res = alloc_resource(GFP_KERNEL);
 	if (!res)
 		return ERR_PTR(-ENOMEM);
 
-	if (dev) {
+	if (dev && (flags & GFR_REQUEST_REGION)) {
 		dr = devres_alloc(devm_region_release,
 				sizeof(struct region_devres), GFP_KERNEL);
 		if (!dr) {
 			free_resource(res);
 			return ERR_PTR(-ENOMEM);
 		}
+	} else if (dev) {
+		if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+			return ERR_PTR(-ENOMEM);
 	}
 
 	write_lock(&resource_lock);
-	for (; addr > size && addr >= base->start; addr -= size) {
-		if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
-				REGION_DISJOINT)
+	for (addr = gfr_start(base, size, align, flags);
+	     gfr_continue(base, addr, size, flags);
+	     addr = gfr_next(addr, size, flags)) {
+		if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+		    REGION_DISJOINT)
 			continue;
 
-		if (__request_region_locked(res, &iomem_resource, addr, size,
-						name, 0))
-			break;
+		if (flags & GFR_REQUEST_REGION) {
+			if (__request_region_locked(res, &iomem_resource, addr,
+						    size, name, 0))
+				break;
 
-		if (dev) {
-			dr->parent = &iomem_resource;
-			dr->start = addr;
-			dr->n = size;
-			devres_add(dev, dr);
-		}
+			if (dev) {
+				dr->parent = &iomem_resource;
+				dr->start = addr;
+				dr->n = size;
+				devres_add(dev, dr);
+			}
 
-		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
-		write_unlock(&resource_lock);
+			res->desc = desc;
+			write_unlock(&resource_lock);
+
+
+			/*
+			 * A driver is claiming this region so revoke any
+			 * mappings.
+			 */
+			revoke_iomem(res);
+		} else {
+			res->start = addr;
+			res->end = addr + size - 1;
+			res->name = name;
+			res->desc = desc;
+			res->flags = IORESOURCE_MEM;
+
+			/*
+			 * Only succeed if the resource hosts an exclusive
+			 * range after the insert
+			 */
+			if (__insert_resource(base, res) || res->child)
+				break;
+
+			write_unlock(&resource_lock);
+		}
 
-		/*
-		 * A driver is claiming this region so revoke any mappings.
-		 */
-		revoke_iomem(res);
 		return res;
 	}
 	write_unlock(&resource_lock);
 
-	free_resource(res);
-	if (dr)
+	if (flags & GFR_REQUEST_REGION) {
+		free_resource(res);
 		devres_free(dr);
+	} else if (dev)
+		devm_release_action(dev, remove_free_mem_region, res);
 
 	return ERR_PTR(-ERANGE);
 }
@@ -1854,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev,
 struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size)
 {
-	return __request_free_mem_region(dev, base, size, dev_name(dev));
+	unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+	return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+				   dev_name(dev),
+				   IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
 }
 EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
 
 struct resource *request_free_mem_region(struct resource *base,
 		unsigned long size, const char *name)
 {
-	return __request_free_mem_region(NULL, base, size, name);
+	unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+	return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+				   IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
 }
 EXPORT_SYMBOL_GPL(request_free_mem_region);
 
-#endif /* CONFIG_DEVICE_PRIVATE */
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+				       unsigned long size, unsigned long align,
+				       const char *name)
+{
+	/* Default of ascending direction and insert resource */
+	unsigned long flags = 0;
+
+	return get_free_mem_region(NULL, base, size, align, name,
+				   IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
 
 static int __init strict_iomem(char *str)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index 169e64192e48..a5b4fee2e3fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -994,9 +994,14 @@ config HMM_MIRROR
 	bool
 	depends on MMU
 
+config GET_FREE_REGION
+	depends on SPARSEMEM
+	bool
+
 config DEVICE_PRIVATE
 	bool "Unaddressable device memory (GPU memory, ...)"
 	depends on ZONE_DEVICE
+	select GET_FREE_REGION
 
 	help
 	  Allows creation of struct pages to represent unaddressable device
-- 
cgit v1.2.3


From dea997733575c5793ca77e166bbbf89097987eb4 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Fri, 22 Jul 2022 10:48:50 +0100
Subject: firmware: cs_dsp: Add pre_stop callback

The code already has a post_stop callback, add a matching pre_stop
callback to the client_ops that is called before execution is stopped.
This callback provides a convenient place for the client code to
communicate with the DSP before it is stopped.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220722094851.92521-1-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 3 +++
 include/linux/firmware/cirrus/cs_dsp.h | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index 7dad6f57d970..b402f841d72c 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -2725,6 +2725,9 @@ void cs_dsp_stop(struct cs_dsp *dsp)
 
 	mutex_lock(&dsp->pwr_lock);
 
+	if (dsp->client_ops->pre_stop)
+		dsp->client_ops->pre_stop(dsp);
+
 	dsp->running = false;
 
 	if (dsp->ops->stop_core)
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 30055706cce2..6ab230218df0 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -189,7 +189,8 @@ struct cs_dsp {
  * @control_remove:	Called under the pwr_lock when a control is destroyed
  * @pre_run:		Called under the pwr_lock by cs_dsp_run() before the core is started
  * @post_run:		Called under the pwr_lock by cs_dsp_run() after the core is started
- * @post_stop:		Called under the pwr_lock by cs_dsp_stop()
+ * @pre_stop:		Called under the pwr_lock by cs_dsp_stop() before the core is stopped
+ * @post_stop:		Called under the pwr_lock by cs_dsp_stop() after the core is stopped
  * @watchdog_expired:	Called when a watchdog expiry is detected
  *
  * These callbacks give the cs_dsp client an opportunity to respond to events
@@ -200,6 +201,7 @@ struct cs_dsp_client_ops {
 	void (*control_remove)(struct cs_dsp_coeff_ctl *ctl);
 	int (*pre_run)(struct cs_dsp *dsp);
 	int (*post_run)(struct cs_dsp *dsp);
+	void (*pre_stop)(struct cs_dsp *dsp);
 	void (*post_stop)(struct cs_dsp *dsp);
 	void (*watchdog_expired)(struct cs_dsp *dsp);
 };
-- 
cgit v1.2.3


From a4b976552f122ea851f556769874022cf097741e Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Fri, 22 Jul 2022 10:48:51 +0100
Subject: firmware: cs_dsp: Add memory chunk helpers

Add helpers that can be layered on top of a buffer read from or to be
written to the DSP to faciliate accessing datastructures within the DSP
memory. These functions handle adding the padding bytes for the DSP,
converting to big endian, and packing arbitrary length data.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20220722094851.92521-2-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 104 +++++++++++++++++++++++++++++++++
 include/linux/firmware/cirrus/cs_dsp.h |  73 +++++++++++++++++++++++
 2 files changed, 177 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index b402f841d72c..81cc3d0f6eec 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -3180,6 +3180,110 @@ static const struct cs_dsp_ops cs_dsp_halo_ops = {
 	.stop_core = cs_dsp_halo_stop_core,
 };
 
+/**
+ * cs_dsp_chunk_write() - Format data to a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ * @nbits: Number of bits to write
+ * @val: Value to write
+ *
+ * This function sequentially writes values into the format required for DSP
+ * memory, it handles both inserting of the padding bytes and converting to
+ * big endian. Note that data is only committed to the chunk when a whole DSP
+ * words worth of data is available.
+ *
+ * Return: Zero for success, a negative number on error.
+ */
+int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val)
+{
+	int nwrite, i;
+
+	nwrite = min(CS_DSP_DATA_WORD_BITS - ch->cachebits, nbits);
+
+	ch->cache <<= nwrite;
+	ch->cache |= val >> (nbits - nwrite);
+	ch->cachebits += nwrite;
+	nbits -= nwrite;
+
+	if (ch->cachebits == CS_DSP_DATA_WORD_BITS) {
+		if (cs_dsp_chunk_end(ch))
+			return -ENOSPC;
+
+		ch->cache &= 0xFFFFFF;
+		for (i = 0; i < sizeof(ch->cache); i++, ch->cache <<= BITS_PER_BYTE)
+			*ch->data++ = (ch->cache & 0xFF000000) >> CS_DSP_DATA_WORD_BITS;
+
+		ch->bytes += sizeof(ch->cache);
+		ch->cachebits = 0;
+	}
+
+	if (nbits)
+		return cs_dsp_chunk_write(ch, nbits, val);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cs_dsp_chunk_write);
+
+/**
+ * cs_dsp_chunk_flush() - Pad remaining data with zero and commit to chunk
+ * @ch: Pointer to the chunk structure
+ *
+ * As cs_dsp_chunk_write only writes data when a whole DSP word is ready to
+ * be written out it is possible that some data will remain in the cache, this
+ * function will pad that data with zeros upto a whole DSP word and write out.
+ *
+ * Return: Zero for success, a negative number on error.
+ */
+int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch)
+{
+	if (!ch->cachebits)
+		return 0;
+
+	return cs_dsp_chunk_write(ch, CS_DSP_DATA_WORD_BITS - ch->cachebits, 0);
+}
+EXPORT_SYMBOL_GPL(cs_dsp_chunk_flush);
+
+/**
+ * cs_dsp_chunk_read() - Parse data from a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ * @nbits: Number of bits to read
+ *
+ * This function sequentially reads values from a DSP memory formatted buffer,
+ * it handles both removing of the padding bytes and converting from big endian.
+ *
+ * Return: A negative number is returned on error, otherwise the read value.
+ */
+int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits)
+{
+	int nread, i;
+	u32 result;
+
+	if (!ch->cachebits) {
+		if (cs_dsp_chunk_end(ch))
+			return -ENOSPC;
+
+		ch->cache = 0;
+		ch->cachebits = CS_DSP_DATA_WORD_BITS;
+
+		for (i = 0; i < sizeof(ch->cache); i++, ch->cache <<= BITS_PER_BYTE)
+			ch->cache |= *ch->data++;
+
+		ch->bytes += sizeof(ch->cache);
+	}
+
+	nread = min(ch->cachebits, nbits);
+	nbits -= nread;
+
+	result = ch->cache >> ((sizeof(ch->cache) * BITS_PER_BYTE) - nread);
+	ch->cache <<= nread;
+	ch->cachebits -= nread;
+
+	if (nbits)
+		result = (result << nbits) | cs_dsp_chunk_read(ch, nbits);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(cs_dsp_chunk_read);
+
 MODULE_DESCRIPTION("Cirrus Logic DSP Support");
 MODULE_AUTHOR("Simon Trimmer <simont@opensource.cirrus.com>");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 6ab230218df0..cad828e21c72 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -11,6 +11,7 @@
 #ifndef __CS_DSP_H
 #define __CS_DSP_H
 
+#include <linux/bits.h>
 #include <linux/device.h>
 #include <linux/firmware.h>
 #include <linux/list.h>
@@ -34,6 +35,7 @@
 #define CS_ADSP2_REGION_ALL (CS_ADSP2_REGION_0 | CS_ADSP2_REGION_1_9)
 
 #define CS_DSP_DATA_WORD_SIZE                3
+#define CS_DSP_DATA_WORD_BITS                (3 * BITS_PER_BYTE)
 
 #define CS_DSP_ACKED_CTL_TIMEOUT_MS          100
 #define CS_DSP_ACKED_CTL_N_QUICKPOLLS        10
@@ -252,4 +254,75 @@ struct cs_dsp_alg_region *cs_dsp_find_alg_region(struct cs_dsp *dsp,
 
 const char *cs_dsp_mem_region_name(unsigned int type);
 
+/**
+ * struct cs_dsp_chunk - Describes a buffer holding data formatted for the DSP
+ * @data:	Pointer to underlying buffer memory
+ * @max:	Pointer to end of the buffer memory
+ * @bytes:	Number of bytes read/written into the memory chunk
+ * @cache:	Temporary holding data as it is formatted
+ * @cachebits:	Number of bits of data currently in cache
+ */
+struct cs_dsp_chunk {
+	u8 *data;
+	u8 *max;
+	int bytes;
+
+	u32 cache;
+	int cachebits;
+};
+
+/**
+ * cs_dsp_chunk() - Create a DSP memory chunk
+ * @data: Pointer to the buffer that will be used to store data
+ * @size: Size of the buffer in bytes
+ *
+ * Return: A cs_dsp_chunk structure
+ */
+static inline struct cs_dsp_chunk cs_dsp_chunk(void *data, int size)
+{
+	struct cs_dsp_chunk ch = {
+		.data = data,
+		.max = data + size,
+	};
+
+	return ch;
+}
+
+/**
+ * cs_dsp_chunk_end() - Check if a DSP memory chunk is full
+ * @ch: Pointer to the chunk structure
+ *
+ * Return: True if the whole buffer has been read/written
+ */
+static inline bool cs_dsp_chunk_end(struct cs_dsp_chunk *ch)
+{
+	return ch->data == ch->max;
+}
+
+/**
+ * cs_dsp_chunk_bytes() - Number of bytes written/read from a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ *
+ * Return: Number of bytes read/written to the buffer
+ */
+static inline int cs_dsp_chunk_bytes(struct cs_dsp_chunk *ch)
+{
+	return ch->bytes;
+}
+
+/**
+ * cs_dsp_chunk_valid_addr() - Check if an address is in a DSP memory chunk
+ * @ch: Pointer to the chunk structure
+ *
+ * Return: True if the given address is within the buffer
+ */
+static inline bool cs_dsp_chunk_valid_addr(struct cs_dsp_chunk *ch, void *addr)
+{
+	return (u8 *)addr >= ch->data && (u8 *)addr < ch->max;
+}
+
+int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val);
+int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch);
+int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits);
+
 #endif
-- 
cgit v1.2.3


From 478af190cb6c501efaa8de2b9c9418ece2e4d0bd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 Jul 2022 10:59:17 -0700
Subject: iomap: remove iomap_writepage

Unused now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/buffered-io.c | 15 ---------------
 include/linux/iomap.h  |  3 ---
 2 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index afd260632836..7c4a56b3ac01 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1519,21 +1519,6 @@ unlock:
 	return 0;
 }
 
-int
-iomap_writepage(struct page *page, struct writeback_control *wbc,
-		struct iomap_writepage_ctx *wpc,
-		const struct iomap_writeback_ops *ops)
-{
-	int ret;
-
-	wpc->ops = ops;
-	ret = iomap_do_writepage(page, wbc, wpc);
-	if (!wpc->ioend)
-		return ret;
-	return iomap_submit_ioend(wpc, wpc->ioend, ret);
-}
-EXPORT_SYMBOL_GPL(iomap_writepage);
-
 int
 iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 		struct iomap_writepage_ctx *wpc,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e552097c67e0..911888560d3e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -303,9 +303,6 @@ void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
 		struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
-int iomap_writepage(struct page *page, struct writeback_control *wbc,
-		struct iomap_writepage_ctx *wpc,
-		const struct iomap_writeback_ops *ops);
 int iomap_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
 		const struct iomap_writeback_ops *ops);
-- 
cgit v1.2.3


From 189c6c33ff421def040b904fb14ef76c5bf5af4c Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Tue, 28 Jun 2022 16:30:59 +0200
Subject: PCI: Extend isolated function probing to s390

Like the jailhouse hypervisor, s390's PCI architecture allows passing
isolated PCI functions to a guest OS instance. As of now this is was not
utilized even with multi-function support as the s390 PCI code makes sure
that only virtual PCI busses including a function with devfn 0 are
presented to the PCI subsystem. A subsequent change will remove this
restriction.

Allow probing such functions by replacing the existing check for
jailhouse_paravirt() with a new hypervisor_isolated_pci_functions() helper.

Link: https://lore.kernel.org/r/20220628143100.3228092-5-schnelle@linux.ibm.com
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
---
 drivers/pci/probe.c        | 2 +-
 include/linux/hypervisor.h | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 1f91ed67b5c5..4948531481d1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2668,7 +2668,7 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
 			 * a hypervisor that passes through individual PCI
 			 * functions.
 			 */
-			if (!jailhouse_paravirt())
+			if (!hypervisor_isolated_pci_functions())
 				break;
 		}
 		fn = next_fn(bus, dev, fn);
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
index fc08b433c856..9efbc54e35e5 100644
--- a/include/linux/hypervisor.h
+++ b/include/linux/hypervisor.h
@@ -32,4 +32,12 @@ static inline bool jailhouse_paravirt(void)
 
 #endif /* !CONFIG_X86 */
 
+static inline bool hypervisor_isolated_pci_functions(void)
+{
+	if (IS_ENABLED(CONFIG_S390))
+		return true;
+
+	return jailhouse_paravirt();
+}
+
 #endif /* __LINUX_HYPEVISOR_H */
-- 
cgit v1.2.3


From abb4970ac33514c84b143516583eaf8cc47abd67 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Sat, 23 Jul 2022 06:49:42 +0900
Subject: PCI: Move isa_dma_bridge_buggy out of asm/dma.h

The isa_dma_bridge_buggy symbol is only used for x86_32, and only x86_32
platforms or quirks ever set it.

Add a new linux/isa-dma.h header that #defines isa_dma_bridge_buggy to 0
except on x86_32, where we keep it as a variable, and remove all the arch-
specific definitions.

[bhelgaas: commit log]
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Link: https://lore.kernel.org/r/20220722214944.831438-3-shorne@gmail.com
Signed-off-by: Stafford Horne <shorne@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/alpha/include/asm/dma.h           |  9 ---------
 arch/arc/include/asm/dma.h             |  5 -----
 arch/arm/include/asm/dma.h             |  6 ------
 arch/arm64/include/asm/pci.h           |  2 --
 arch/csky/include/asm/pci.h            |  2 --
 arch/ia64/include/asm/dma.h            |  2 --
 arch/m68k/include/asm/dma.h            |  6 ------
 arch/microblaze/include/asm/dma.h      |  6 ------
 arch/mips/include/asm/dma.h            |  8 --------
 arch/parisc/include/asm/dma.h          |  6 ------
 arch/powerpc/include/asm/dma.h         |  6 ------
 arch/riscv/include/asm/pci.h           |  2 --
 arch/s390/include/asm/dma.h            |  6 ------
 arch/sh/include/asm/dma.h              |  6 ------
 arch/sparc/include/asm/dma.h           |  8 --------
 arch/um/include/asm/pci.h              |  2 --
 arch/x86/include/asm/dma.h             |  8 --------
 arch/xtensa/include/asm/dma.h          |  7 -------
 drivers/comedi/drivers/comedi_isadma.c |  2 +-
 drivers/pci/pci.c                      |  2 ++
 drivers/pci/quirks.c                   |  4 +++-
 include/linux/isa-dma.h                | 14 ++++++++++++++
 sound/core/isadma.c                    |  2 +-
 23 files changed, 21 insertions(+), 100 deletions(-)
 create mode 100644 include/linux/isa-dma.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/dma.h b/arch/alpha/include/asm/dma.h
index 28610ea7786d..a04d76b96089 100644
--- a/arch/alpha/include/asm/dma.h
+++ b/arch/alpha/include/asm/dma.h
@@ -365,13 +365,4 @@ extern void free_dma(unsigned int dmanr);	/* release it again */
 #define KERNEL_HAVE_CHECK_DMA
 extern int check_dma(unsigned int dmanr);
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
-
 #endif /* _ASM_DMA_H */
diff --git a/arch/arc/include/asm/dma.h b/arch/arc/include/asm/dma.h
index 5b744f4b10a7..02431027ed2f 100644
--- a/arch/arc/include/asm/dma.h
+++ b/arch/arc/include/asm/dma.h
@@ -7,10 +7,5 @@
 #define ASM_ARC_DMA_H
 
 #define MAX_DMA_ADDRESS 0xC0000000
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	0
-#endif
 
 #endif
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index a81dda65c576..907d139be431 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -143,10 +143,4 @@ extern int  get_dma_residue(unsigned int chan);
 
 #endif /* CONFIG_ISA_DMA_API */
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy    (0)
-#endif
-
 #endif /* __ASM_ARM_DMA_H */
diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h
index 0aebc3488c32..682c922b5658 100644
--- a/arch/arm64/include/asm/pci.h
+++ b/arch/arm64/include/asm/pci.h
@@ -20,8 +20,6 @@
 #define arch_can_pci_mmap_wc() 1
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE	1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h
index 0535f1aaae38..5c02454ec724 100644
--- a/arch/csky/include/asm/pci.h
+++ b/arch/csky/include/asm/pci.h
@@ -15,8 +15,6 @@
 /* C-SKY shim does not initialize PCI bus */
 #define pcibios_assign_all_busses() 1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
index 59625e9c1f9c..eaed2626ffda 100644
--- a/arch/ia64/include/asm/dma.h
+++ b/arch/ia64/include/asm/dma.h
@@ -12,8 +12,6 @@
 
 extern unsigned long MAX_DMA_ADDRESS;
 
-extern int isa_dma_bridge_buggy;
-
 #define free_dma(x)
 
 #endif /* _ASM_IA64_DMA_H */
diff --git a/arch/m68k/include/asm/dma.h b/arch/m68k/include/asm/dma.h
index f6c5e0dfb4e5..1c8d9c5bc2fa 100644
--- a/arch/m68k/include/asm/dma.h
+++ b/arch/m68k/include/asm/dma.h
@@ -6,10 +6,4 @@
    bootmem allocator (but this should do it for this) */
 #define MAX_DMA_ADDRESS PAGE_OFFSET
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy    (0)
-#endif
-
 #endif /* _M68K_DMA_H */
diff --git a/arch/microblaze/include/asm/dma.h b/arch/microblaze/include/asm/dma.h
index f801582be912..7484c9eb66c4 100644
--- a/arch/microblaze/include/asm/dma.h
+++ b/arch/microblaze/include/asm/dma.h
@@ -9,10 +9,4 @@
 /* Virtual address corresponding to last available physical memory address.  */
 #define MAX_DMA_ADDRESS (CONFIG_KERNEL_START + memory_size - 1)
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy     (0)
-#endif
-
 #endif /* _ASM_MICROBLAZE_DMA_H */
diff --git a/arch/mips/include/asm/dma.h b/arch/mips/include/asm/dma.h
index be726b943530..d6186e6bea7e 100644
--- a/arch/mips/include/asm/dma.h
+++ b/arch/mips/include/asm/dma.h
@@ -307,12 +307,4 @@ static __inline__ int get_dma_residue(unsigned int dmanr)
 extern int request_dma(unsigned int dmanr, const char * device_id);	/* reserve a DMA channel */
 extern void free_dma(unsigned int dmanr);	/* release it again */
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* _ASM_DMA_H */
diff --git a/arch/parisc/include/asm/dma.h b/arch/parisc/include/asm/dma.h
index eea80ed34e6d..9e8c101de902 100644
--- a/arch/parisc/include/asm/dma.h
+++ b/arch/parisc/include/asm/dma.h
@@ -176,10 +176,4 @@ static __inline__ void set_dma_count(unsigned int dmanr, unsigned int count)
 
 #define free_dma(dmanr)
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
 #endif /* _ASM_DMA_H */
diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h
index 6161a9596196..d97c66d9ae34 100644
--- a/arch/powerpc/include/asm/dma.h
+++ b/arch/powerpc/include/asm/dma.h
@@ -340,11 +340,5 @@ extern int request_dma(unsigned int dmanr, const char *device_id);
 /* release it again */
 extern void free_dma(unsigned int dmanr);
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* __KERNEL__ */
 #endif	/* _ASM_POWERPC_DMA_H */
diff --git a/arch/riscv/include/asm/pci.h b/arch/riscv/include/asm/pci.h
index a7b8f0d0df7f..f904df586c03 100644
--- a/arch/riscv/include/asm/pci.h
+++ b/arch/riscv/include/asm/pci.h
@@ -20,8 +20,6 @@
 
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/s390/include/asm/dma.h b/arch/s390/include/asm/dma.h
index 6f26f35d4a71..dec1c4ce628c 100644
--- a/arch/s390/include/asm/dma.h
+++ b/arch/s390/include/asm/dma.h
@@ -11,10 +11,4 @@
  */
 #define MAX_DMA_ADDRESS         0x80000000
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* _ASM_S390_DMA_H */
diff --git a/arch/sh/include/asm/dma.h b/arch/sh/include/asm/dma.h
index 17d23ae98c77..c8bee3f985a2 100644
--- a/arch/sh/include/asm/dma.h
+++ b/arch/sh/include/asm/dma.h
@@ -137,10 +137,4 @@ extern int register_chan_caps(const char *dmac, struct dma_chan_caps *capslist);
 extern int dma_create_sysfs_files(struct dma_channel *, struct dma_info *);
 extern void dma_remove_sysfs_files(struct dma_channel *, struct dma_info *);
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* __ASM_SH_DMA_H */
diff --git a/arch/sparc/include/asm/dma.h b/arch/sparc/include/asm/dma.h
index 462e7c794a09..08043f35b110 100644
--- a/arch/sparc/include/asm/dma.h
+++ b/arch/sparc/include/asm/dma.h
@@ -82,14 +82,6 @@
 #define DMA_BURST64      0x40
 #define DMA_BURSTBITS    0x7f
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
 #ifdef CONFIG_SPARC32
 struct device;
 
diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h
index 26b96c02ef61..1211855aff34 100644
--- a/arch/um/include/asm/pci.h
+++ b/arch/um/include/asm/pci.h
@@ -9,8 +9,6 @@
 
 #define pcibios_assign_all_busses() 1
 
-extern int isa_dma_bridge_buggy;
-
 #ifdef CONFIG_PCI_DOMAINS
 static inline int pci_proc_domain(struct pci_bus *bus)
 {
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 8e95aa4b0d17..8ae6e0e11b8b 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -307,12 +307,4 @@ extern int request_dma(unsigned int dmanr, const char *device_id);
 extern void free_dma(unsigned int dmanr);
 #endif
 
-/* From PCI */
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy	(0)
-#endif
-
 #endif /* _ASM_X86_DMA_H */
diff --git a/arch/xtensa/include/asm/dma.h b/arch/xtensa/include/asm/dma.h
index bb099a373b5a..172644539032 100644
--- a/arch/xtensa/include/asm/dma.h
+++ b/arch/xtensa/include/asm/dma.h
@@ -52,11 +52,4 @@
 extern int request_dma(unsigned int dmanr, const char * device_id);
 extern void free_dma(unsigned int dmanr);
 
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#else
-#define isa_dma_bridge_buggy 	(0)
-#endif
-
-
 #endif
diff --git a/drivers/comedi/drivers/comedi_isadma.c b/drivers/comedi/drivers/comedi_isadma.c
index 700982464c53..020b3d1e1ac0 100644
--- a/drivers/comedi/drivers/comedi_isadma.c
+++ b/drivers/comedi/drivers/comedi_isadma.c
@@ -8,7 +8,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
-#include <asm/dma.h>
+#include <linux/isa-dma.h>
 #include <linux/comedi/comedidev.h>
 #include <linux/comedi/comedi_isadma.h>
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index cfaf40a540a8..60c55d2cb2cc 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -41,8 +41,10 @@ const char *pci_power_names[] = {
 };
 EXPORT_SYMBOL_GPL(pci_power_names);
 
+#ifdef CONFIG_X86_32
 int isa_dma_bridge_buggy;
 EXPORT_SYMBOL(isa_dma_bridge_buggy);
+#endif
 
 int pci_pci_problems;
 EXPORT_SYMBOL(pci_pci_problems);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 41aeaa235132..6fc64509eee7 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/pci.h>
+#include <linux/isa-dma.h> /* isa_dma_bridge_buggy */
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/acpi.h>
@@ -30,7 +31,6 @@
 #include <linux/pm_runtime.h>
 #include <linux/suspend.h>
 #include <linux/switchtec.h>
-#include <asm/dma.h>	/* isa_dma_bridge_buggy */
 #include "pci.h"
 
 static ktime_t fixup_debug_start(struct pci_dev *dev,
@@ -239,6 +239,7 @@ static void quirk_passive_release(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82441,	quirk_passive_release);
 DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82441,	quirk_passive_release);
 
+#ifdef CONFIG_X86_32
 /*
  * The VIA VP2/VP3/MVP3 seem to have some 'features'. There may be a
  * workaround but VIA don't answer queries. If you happen to have good
@@ -265,6 +266,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AL,	PCI_DEVICE_ID_AL_M1533,		quirk_isa_dma
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_1,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_2,	quirk_isa_dma_hangs);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NEC,	PCI_DEVICE_ID_NEC_CBUS_3,	quirk_isa_dma_hangs);
+#endif
 
 /*
  * Intel NM10 "TigerPoint" LPC PM1a_STS.BM_STS must be clear
diff --git a/include/linux/isa-dma.h b/include/linux/isa-dma.h
new file mode 100644
index 000000000000..61504a8c1b9e
--- /dev/null
+++ b/include/linux/isa-dma.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_ISA_DMA_H
+#define __LINUX_ISA_DMA_H
+
+#include <asm/dma.h>
+
+#if defined(CONFIG_PCI) && defined(CONFIG_X86_32)
+extern int isa_dma_bridge_buggy;
+#else
+#define isa_dma_bridge_buggy	(0)
+#endif
+
+#endif /* __LINUX_ISA_DMA_H */
diff --git a/sound/core/isadma.c b/sound/core/isadma.c
index 1f45ede023b4..18a86212e3a8 100644
--- a/sound/core/isadma.c
+++ b/sound/core/isadma.c
@@ -12,8 +12,8 @@
 #undef HAVE_REALLY_SLOW_DMA_CONTROLLER
 
 #include <linux/export.h>
+#include <linux/isa-dma.h>
 #include <sound/core.h>
-#include <asm/dma.h>
 
 /**
  * snd_dma_program - program an ISA DMA transfer
-- 
cgit v1.2.3


From e8f90717ed3b58e81c480b3aa38e641c0da5a456 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:47 -0700
Subject: vfio: Make vfio_unpin_pages() return void

There's only one caller that checks its return value with a WARN_ON_ONCE,
while all other callers don't check the return value at all. Above that,
an undo function should not fail. So, simplify the API to return void by
embedding similar WARN_ONs.

Also for users to pinpoint which condition fails, separate WARN_ON lines,
yet remove the "driver->ops->unpin_pages" check, since it's unreasonable
for callers to unpin on something totally random that wasn't even pinned.
And remove NULL pointer checks for they would trigger oops vs. warnings.
Note that npage is already validated in the vfio core, thus drop the same
check in the type1 code.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-2-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  2 +-
 drivers/gpu/drm/i915/gvt/kvmgt.c                  |  5 +----
 drivers/vfio/vfio.c                               | 21 +++++++--------------
 drivers/vfio/vfio.h                               |  2 +-
 drivers/vfio/vfio_iommu_type1.c                   | 15 ++++++---------
 include/linux/vfio.h                              |  4 ++--
 6 files changed, 18 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index 1c57815619fd..b0fdf76b339a 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -265,7 +265,7 @@ driver::
 	int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 				  int npage, int prot, unsigned long *phys_pfn);
 
-	int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+	void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 				    int npage);
 
 These functions call back into the back-end IOMMU module by using the pin_pages
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index ecd5bb37b63a..4d32a2748958 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -231,18 +231,15 @@ static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		unsigned long size)
 {
-	struct drm_i915_private *i915 = vgpu->gvt->gt->i915;
 	int total_pages;
 	int npage;
-	int ret;
 
 	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
 
 	for (npage = 0; npage < total_pages; npage++) {
 		unsigned long cur_gfn = gfn + npage;
 
-		ret = vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
-		drm_WARN_ON(&i915->drm, ret != 1);
+		vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
 	}
 }
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index b3ce8073cfb1..92b10aafae28 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1983,31 +1983,24 @@ EXPORT_SYMBOL(vfio_pin_pages);
  *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  * @npage [in]   : count of elements in user_pfn array.  This count should not
  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
- * Return error or number of pages unpinned.
  */
-int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		     int npage)
+void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		      int npage)
 {
 	struct vfio_container *container;
 	struct vfio_iommu_driver *driver;
-	int ret;
 
-	if (!user_pfn || !npage || !vfio_assert_device_open(device))
-		return -EINVAL;
+	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
+		return;
 
-	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
-		return -E2BIG;
+	if (WARN_ON(!vfio_assert_device_open(device)))
+		return;
 
 	/* group->container cannot change while a vfio device is open */
 	container = device->group->container;
 	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->unpin_pages))
-		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
-					       npage);
-	else
-		ret = -ENOTTY;
 
-	return ret;
+	driver->ops->unpin_pages(container->iommu_data, user_pfn, npage);
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 4a7db1f3c33e..6a8424b407c7 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -53,7 +53,7 @@ struct vfio_iommu_driver_ops {
 				     unsigned long *user_pfn,
 				     int npage, int prot,
 				     unsigned long *phys_pfn);
-	int		(*unpin_pages)(void *iommu_data,
+	void		(*unpin_pages)(void *iommu_data,
 				       unsigned long *user_pfn, int npage);
 	void		(*register_device)(void *iommu_data,
 					   struct vfio_device *vdev);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 026a1d2553a2..e49fbe9968ef 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -949,20 +949,16 @@ pin_done:
 	return ret;
 }
 
-static int vfio_iommu_type1_unpin_pages(void *iommu_data,
-					unsigned long *user_pfn,
-					int npage)
+static void vfio_iommu_type1_unpin_pages(void *iommu_data,
+					 unsigned long *user_pfn, int npage)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	bool do_accounting;
 	int i;
 
-	if (!iommu || !user_pfn || npage <= 0)
-		return -EINVAL;
-
 	/* Supported for v2 version only */
-	if (!iommu->v2)
-		return -EACCES;
+	if (WARN_ON(!iommu->v2))
+		return;
 
 	mutex_lock(&iommu->lock);
 
@@ -980,7 +976,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 	}
 
 	mutex_unlock(&iommu->lock);
-	return i > 0 ? i : -EINVAL;
+
+	WARN_ON(i != npage);
 }
 
 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 19cefbaa3d06..9f7d74c24925 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -163,8 +163,8 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 
 int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 		   int npage, int prot, unsigned long *phys_pfn);
-int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		     int npage);
+void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+		      int npage);
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
 		void *data, size_t len, bool write);
 
-- 
cgit v1.2.3


From ba8ec7a607e98e8491a1fcf924a2e6c96ac9d413 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 23 Jul 2022 14:47:28 -0400
Subject: SUNRPC: Shrink size of struct rpc_task

Move the field 'tk_rpc_status' so that we eliminate a 4 byte hole in the
structure.
For x86_64, this shrinks the size of the struct by 8 bytes.

'pahole' output before the change:
        /* size: 232, cachelines: 4, members: 27 */
        /* sum members: 222, holes: 1, sum holes: 4 */
        /* sum bitfield members: 8 bits (1 bytes) */
        /* padding: 5 */
        /* last cacheline: 40 bytes */

'pahole' output after the change:
        /* size: 224, cachelines: 4, members: 27 */
        /* padding: 1 */
        /* last cacheline: 32 bytes */

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 1d7a3e51b795..acc62647317c 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -61,8 +61,6 @@ struct rpc_task {
 		struct rpc_wait		tk_wait;	/* RPC wait */
 	} u;
 
-	int			tk_rpc_status;	/* Result of last RPC operation */
-
 	/*
 	 * RPC call state
 	 */
@@ -82,6 +80,8 @@ struct rpc_task {
 	ktime_t			tk_start;	/* RPC task init timestamp */
 
 	pid_t			tk_owner;	/* Process id for batching tasks */
+
+	int			tk_rpc_status;	/* Result of last RPC operation */
 	unsigned short		tk_flags;	/* misc flags */
 	unsigned short		tk_timeouts;	/* maj timeouts */
 
-- 
cgit v1.2.3


From 69d966510d9f5de81588b37d23a9ee8ccc477b23 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 22 Jul 2022 14:12:20 -0400
Subject: nfs: only issue commit in DIO codepath if we have uncommitted data

Currently, we try to determine whether to issue a commit based on
nfs_write_need_commit which looks at the current verifier. In the case
where we got a short write and then tried to follow it up with one that
failed, the verifier can't be trusted.

What we really want to know is whether the pgio request had any
successful writes that came back as UNSTABLE. Add a new flag to the pgio
request, and use that to indicate that we've had a successful unstable
write. Only issue a commit if that flag is set.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c         |  2 +-
 fs/nfs/write.c          | 48 ++++++++++++++++++++++++++++++------------------
 include/linux/nfs_xdr.h |  1 +
 3 files changed, 32 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index a47d13296194..86df66bb14c5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -690,7 +690,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 	}
 
 	nfs_direct_count_bytes(dreq, hdr);
-	if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
 		if (!dreq->flags)
 			dreq->flags = NFS_ODIRECT_DO_COMMIT;
 		flags = dreq->flags;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1c706465d090..16d166bc4099 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1576,25 +1576,37 @@ static int nfs_writeback_done(struct rpc_task *task,
 	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 	trace_nfs_writeback_done(task, hdr);
 
-	if (hdr->res.verf->committed < hdr->args.stable &&
-	    task->tk_status >= 0) {
-		/* We tried a write call, but the server did not
-		 * commit data to stable storage even though we
-		 * requested it.
-		 * Note: There is a known bug in Tru64 < 5.0 in which
-		 *	 the server reports NFS_DATA_SYNC, but performs
-		 *	 NFS_FILE_SYNC. We therefore implement this checking
-		 *	 as a dprintk() in order to avoid filling syslog.
-		 */
-		static unsigned long    complain;
+	if (task->tk_status >= 0) {
+		enum nfs3_stable_how committed = hdr->res.verf->committed;
+
+		if (committed == NFS_UNSTABLE) {
+			/*
+			 * We have some uncommitted data on the server at
+			 * this point, so ensure that we keep track of that
+			 * fact irrespective of what later writes do.
+			 */
+			set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags);
+		}
 
-		/* Note this will print the MDS for a DS write */
-		if (time_before(complain, jiffies)) {
-			dprintk("NFS:       faulty NFS server %s:"
-				" (committed = %d) != (stable = %d)\n",
-				NFS_SERVER(inode)->nfs_client->cl_hostname,
-				hdr->res.verf->committed, hdr->args.stable);
-			complain = jiffies + 300 * HZ;
+		if (committed < hdr->args.stable) {
+			/* We tried a write call, but the server did not
+			 * commit data to stable storage even though we
+			 * requested it.
+			 * Note: There is a known bug in Tru64 < 5.0 in which
+			 *	 the server reports NFS_DATA_SYNC, but performs
+			 *	 NFS_FILE_SYNC. We therefore implement this checking
+			 *	 as a dprintk() in order to avoid filling syslog.
+			 */
+			static unsigned long    complain;
+
+			/* Note this will print the MDS for a DS write */
+			if (time_before(complain, jiffies)) {
+				dprintk("NFS:       faulty NFS server %s:"
+					" (committed = %d) != (stable = %d)\n",
+					NFS_SERVER(inode)->nfs_client->cl_hostname,
+					committed, hdr->args.stable);
+				complain = jiffies + 300 * HZ;
+			}
 		}
 	}
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0e3aa0f5f324..e86cf6642d21 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1600,6 +1600,7 @@ enum {
 	NFS_IOHDR_STAT,
 	NFS_IOHDR_RESEND_PNFS,
 	NFS_IOHDR_RESEND_MDS,
+	NFS_IOHDR_UNSTABLE_WRITES,
 };
 
 struct nfs_io_completion;
-- 
cgit v1.2.3


From 4f5f3b6028343d687d0533329b130e4b8280ab32 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:31 -0400
Subject: SUNRPC: Introduce xdr_stream_move_subsegment()

I do this by creating an xdr subsegment for the range we will be
operating over. This lets me shift data to the correct place without
potentially overwriting anything already there.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5860f32e3958..7dcc6c31fe29 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -262,6 +262,8 @@ extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, uns
 extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
 				  unsigned int len);
+extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
+					       unsigned int target, unsigned int length);
 
 /**
  * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5d2b3e6979fb..8ba11a754297 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -775,6 +775,34 @@ static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
 	xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift);
 }
 
+static void xdr_buf_head_shift_left(const struct xdr_buf *buf,
+				    unsigned int base, unsigned int len,
+				    unsigned int shift)
+{
+	const struct kvec *head = buf->head;
+	unsigned int bytes;
+
+	if (!shift || !len)
+		return;
+
+	if (shift > base) {
+		bytes = (shift - base);
+		if (bytes >= len)
+			return;
+		base += bytes;
+		len -= bytes;
+	}
+
+	if (base < head->iov_len) {
+		bytes = min_t(unsigned int, len, head->iov_len - base);
+		memmove(head->iov_base + (base - shift),
+			head->iov_base + base, bytes);
+		base += bytes;
+		len -= bytes;
+	}
+	xdr_buf_pages_shift_left(buf, base - head->iov_len, len, shift);
+}
+
 /**
  * xdr_shrink_bufhead
  * @buf: xdr_buf
@@ -1680,6 +1708,37 @@ bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
 }
 EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
 
+/**
+ * xdr_stream_move_subsegment - Move part of a stream to another position
+ * @xdr: the source xdr_stream
+ * @offset: the source offset of the segment
+ * @target: the target offset of the segment
+ * @length: the number of bytes to move
+ *
+ * Moves @length bytes from @offset to @target in the xdr_stream, overwriting
+ * anything in its space. Returns the number of bytes in the segment.
+ */
+unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
+					unsigned int target, unsigned int length)
+{
+	struct xdr_buf buf;
+	unsigned int shift;
+
+	if (offset < target) {
+		shift = target - offset;
+		if (xdr_buf_subsegment(xdr->buf, &buf, offset, shift + length) < 0)
+			return 0;
+		xdr_buf_head_shift_right(&buf, 0, length, shift);
+	} else if (offset > target) {
+		shift = offset - target;
+		if (xdr_buf_subsegment(xdr->buf, &buf, target, shift + length) < 0)
+			return 0;
+		xdr_buf_head_shift_left(&buf, shift, length, shift);
+	}
+	return length;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment);
+
 /**
  * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
  * @buf: buf to be trimmed
-- 
cgit v1.2.3


From 7c4cd5f4d2dd4a028a46bfb696b0cd387caadf33 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:32 -0400
Subject: SUNRPC: Add a function for directly setting the xdr page len

We need to do this step during READ_PLUS decoding so that we know pages
are the right length and any extra data has been preserved in the tail.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  1 +
 net/sunrpc/xdr.c           | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 7dcc6c31fe29..8cd38a9994ca 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -258,6 +258,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len);
 extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 8ba11a754297..e4ac700ca554 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1500,6 +1500,36 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(xdr_read_pages);
 
+/**
+ * xdr_set_pagelen - Sets the length of the XDR pages
+ * @xdr: pointer to xdr_stream struct
+ * @len: new length of the XDR page data
+ *
+ * Either grows or shrinks the length of the xdr pages by setting pagelen to
+ * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas
+ * when growing any data beyond the current pointer is moved into the tail.
+ *
+ * Returns True if the operation was successful, and False otherwise.
+ */
+void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
+{
+	struct xdr_buf *buf = xdr->buf;
+	size_t remaining = xdr_stream_remaining(xdr);
+	size_t base = 0;
+
+	if (len < buf->page_len) {
+		base = buf->page_len - len;
+		xdr_shrink_pagelen(buf, len);
+	} else {
+		xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr),
+					 buf->page_len, remaining);
+		if (len > buf->page_len)
+			xdr_buf_try_expand(buf, len - buf->page_len);
+	}
+	xdr_set_tail_base(xdr, base, remaining);
+}
+EXPORT_SYMBOL_GPL(xdr_set_pagelen);
+
 unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
 			    unsigned int length)
 {
-- 
cgit v1.2.3


From e1bd87608d4b6f87813f79b91e834de610f1049b Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:33 -0400
Subject: SUNRPC: Add a function for zeroing out a portion of an xdr_stream

This will be used during READ_PLUS decoding for handling HOLE segments.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8cd38a9994ca..f0ab06acab61 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -265,6 +265,8 @@ extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf
 				  unsigned int len);
 extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
 					       unsigned int target, unsigned int length);
+extern unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset,
+				    unsigned int length);
 
 /**
  * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e4ac700ca554..f09a7ab1a82b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1769,6 +1769,29 @@ unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int off
 }
 EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment);
 
+/**
+ * xdr_stream_zero - zero out a portion of an xdr_stream
+ * @xdr: an xdr_stream to zero out
+ * @offset: the starting point in the stream
+ * @length: the number of bytes to zero
+ */
+unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset,
+			     unsigned int length)
+{
+	struct xdr_buf buf;
+
+	if (xdr_buf_subsegment(xdr->buf, &buf, offset, length) < 0)
+		return 0;
+	if (buf.head[0].iov_len)
+		xdr_buf_iov_zero(buf.head, 0, buf.head[0].iov_len);
+	if (buf.page_len > 0)
+		xdr_buf_pages_zero(&buf, 0, buf.page_len);
+	if (buf.tail[0].iov_len)
+		xdr_buf_iov_zero(buf.tail, 0, buf.tail[0].iov_len);
+	return length;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_zero);
+
 /**
  * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
  * @buf: buf to be trimmed
-- 
cgit v1.2.3


From 29946fbcb2c31a2a367887dc58a2e7e5b012e285 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Thu, 21 Jul 2022 14:21:35 -0400
Subject: SUNRPC: Remove xdr_align_data() and xdr_expand_hole()

These functions are no longer needed now that the NFS client places data
and hole segments directly.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |  2 --
 net/sunrpc/xdr.c           | 66 ----------------------------------------------
 2 files changed, 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index f0ab06acab61..f38c97f45354 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -259,8 +259,6 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
 extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len);
-extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
-extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
 				  unsigned int len);
 extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f09a7ab1a82b..482586c23fdd 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1530,72 +1530,6 @@ void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(xdr_set_pagelen);
 
-unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
-			    unsigned int length)
-{
-	struct xdr_buf *buf = xdr->buf;
-	unsigned int from, bytes, len;
-	unsigned int shift;
-
-	xdr_realign_pages(xdr);
-	from = xdr_page_pos(xdr);
-
-	if (from >= buf->page_len + buf->tail->iov_len)
-		return 0;
-	if (from + buf->head->iov_len >= buf->len)
-		return 0;
-
-	len = buf->len - buf->head->iov_len;
-
-	/* We only shift data left! */
-	if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n",
-		      from, offset))
-		return 0;
-	if (WARN_ONCE(offset > buf->page_len,
-		      "SUNRPC: buffer overflow. offset=%u, page_len=%u\n",
-		      offset, buf->page_len))
-		return 0;
-
-	/* Move page data to the left */
-	shift = from - offset;
-	xdr_buf_pages_shift_left(buf, from, len, shift);
-
-	bytes = xdr_stream_remaining(xdr);
-	if (length > bytes)
-		length = bytes;
-	bytes -= length;
-
-	xdr->buf->len -= shift;
-	xdr_set_page(xdr, offset + length, bytes);
-	return length;
-}
-EXPORT_SYMBOL_GPL(xdr_align_data);
-
-unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
-			     unsigned int length)
-{
-	struct xdr_buf *buf = xdr->buf;
-	unsigned int from, to, shift;
-
-	xdr_realign_pages(xdr);
-	from = xdr_page_pos(xdr);
-	to = xdr_align_size(offset + length);
-
-	/* Could the hole be behind us? */
-	if (to > from) {
-		unsigned int buflen = buf->len - buf->head->iov_len;
-		shift = to - from;
-		xdr_buf_try_expand(buf, shift);
-		xdr_buf_pages_shift_right(buf, from, buflen, shift);
-		xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
-	} else if (to != from)
-		xdr_align_data(xdr, to, 0);
-	xdr_buf_pages_zero(buf, offset, length);
-
-	return length;
-}
-EXPORT_SYMBOL_GPL(xdr_expand_hole);
-
 /**
  * xdr_enter_page - decode data from the XDR page
  * @xdr: pointer to xdr_stream struct
-- 
cgit v1.2.3


From 7ffcdaa670164a2ad3844a5ef6df5423782ba290 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:21 -0400
Subject: SUNRPC expose functions for offline remote xprt functionality

Re-arrange the code that make offline transport and delete transport
callable functions.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  3 +++
 net/sunrpc/sysfs.c          | 28 +++++-----------------------
 net/sunrpc/xprt.c           | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 522bbf937957..0d51b9f9ea37 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -505,4 +505,7 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
 	return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps);
+void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps);
+void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps);
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index a3a2f8aeb80e..7330eb9a70cf 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -314,32 +314,14 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj,
 		goto release_tasks;
 	}
 	if (offline) {
-		if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) {
-			spin_lock(&xps->xps_lock);
-			xps->xps_nactive--;
-			spin_unlock(&xps->xps_lock);
-		}
+		xprt_set_offline_locked(xprt, xps);
 	} else if (online) {
-		if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) {
-			spin_lock(&xps->xps_lock);
-			xps->xps_nactive++;
-			spin_unlock(&xps->xps_lock);
-		}
+		xprt_set_online_locked(xprt, xps);
 	} else if (remove) {
-		if (test_bit(XPRT_OFFLINE, &xprt->state)) {
-			if (!test_and_set_bit(XPRT_REMOVE, &xprt->state)) {
-				xprt_force_disconnect(xprt);
-				if (test_bit(XPRT_CONNECTED, &xprt->state)) {
-					if (!xprt->sending.qlen &&
-					    !xprt->pending.qlen &&
-					    !xprt->backlog.qlen &&
-					    !atomic_long_read(&xprt->queuelen))
-						rpc_xprt_switch_remove_xprt(xps, xprt);
-				}
-			}
-		} else {
+		if (test_bit(XPRT_OFFLINE, &xprt->state))
+			xprt_delete_locked(xprt, xps);
+		else
 			count = -EINVAL;
-		}
 	}
 
 release_tasks:
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 86d62cffba0d..8f8e3c952f24 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -2152,3 +2152,35 @@ void xprt_put(struct rpc_xprt *xprt)
 		kref_put(&xprt->kref, xprt_destroy_kref);
 }
 EXPORT_SYMBOL_GPL(xprt_put);
+
+void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+	if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) {
+		spin_lock(&xps->xps_lock);
+		xps->xps_nactive--;
+		spin_unlock(&xps->xps_lock);
+	}
+}
+
+void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+	if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) {
+		spin_lock(&xps->xps_lock);
+		xps->xps_nactive++;
+		spin_unlock(&xps->xps_lock);
+	}
+}
+
+void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
+{
+	if (test_and_set_bit(XPRT_REMOVE, &xprt->state))
+		return;
+
+	xprt_force_disconnect(xprt);
+	if (!test_bit(XPRT_CONNECTED, &xprt->state))
+		return;
+
+	if (!xprt->sending.qlen && !xprt->pending.qlen &&
+	    !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen))
+		rpc_xprt_switch_remove_xprt(xps, xprt);
+}
-- 
cgit v1.2.3


From 895245ccea251ff54ea19bc364c9a49007918115 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:22 -0400
Subject: SUNRPC add function to offline remove trunkable transports

Iterate thru available transports in the xprt_switch for all
trunkable transports offline and possibly remote them as well.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 46 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 90501404fa49..d14333f4947a 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -234,6 +234,7 @@ int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
 			struct rpc_xprt_switch *,
 			struct rpc_xprt *,
 			void *);
+void		rpc_clnt_manage_trunked_xprts(struct rpc_clnt *);
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a97d4e06cae3..2b079c4d8af1 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -3000,6 +3000,52 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+static int rpc_xprt_offline(struct rpc_clnt *clnt,
+			    struct rpc_xprt *xprt,
+			    void *data)
+{
+	struct rpc_xprt *main_xprt;
+	struct rpc_xprt_switch *xps;
+	int err = 0;
+
+	xprt_get(xprt);
+
+	rcu_read_lock();
+	main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+	err = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr,
+				(struct sockaddr *)&main_xprt->addr);
+	rcu_read_unlock();
+	xprt_put(main_xprt);
+	if (err)
+		goto out;
+
+	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) {
+		err = -EINTR;
+		goto out;
+	}
+	xprt_set_offline_locked(xprt, xps);
+
+	xprt_release_write(xprt, NULL);
+out:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	return err;
+}
+
+/* rpc_clnt_manage_trunked_xprts -- offline trunked transports
+ * @clnt rpc_clnt structure
+ *
+ * For each active transport found in the rpc_clnt structure call
+ * the function rpc_xprt_offline() which will identify trunked transports
+ * and will mark them offline.
+ */
+void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *clnt)
+{
+	rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_offline, NULL);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_manage_trunked_xprts);
+
 struct connect_timeout_data {
 	unsigned long connect_timeout;
 	unsigned long reconnect_timeout;
-- 
cgit v1.2.3


From 95d0d30c66b855f614e677b8cd0455eed0765a6f Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:24 -0400
Subject: SUNRPC create an iterator to list only OFFLINE xprts

Create a new iterator helper that will go thru the all the transports
in the switch and return transports that are marked OFFLINE.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtmultipath.h |  3 ++
 net/sunrpc/clnt.c                    | 11 +++-
 net/sunrpc/xprtmultipath.c           | 99 ++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index bbb8a5fa0816..688ca7eb1d01 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -63,6 +63,9 @@ extern void xprt_iter_init(struct rpc_xprt_iter *xpi,
 extern void xprt_iter_init_listall(struct rpc_xprt_iter *xpi,
 		struct rpc_xprt_switch *xps);
 
+extern void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
+		struct rpc_xprt_switch *xps);
+
 extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi);
 
 extern struct rpc_xprt_switch *xprt_iter_xchg_switch(
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2b079c4d8af1..68021b70340d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -786,7 +786,8 @@ out_revert:
 EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
 
 static
-int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi,
+			     void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps))
 {
 	struct rpc_xprt_switch *xps;
 
@@ -795,11 +796,17 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
 	rcu_read_unlock();
 	if (xps == NULL)
 		return -EAGAIN;
-	xprt_iter_init_listall(xpi, xps);
+	func(xpi, xps);
 	xprt_switch_put(xps);
 	return 0;
 }
 
+static
+int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+{
+	return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall);
+}
+
 /**
  * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports
  * @clnt: pointer to client
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 1693f81aae37..8def8423fc0a 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -27,6 +27,7 @@ typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps,
 static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular;
 static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin;
 static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall;
+static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline;
 
 static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
 		struct rpc_xprt *xprt)
@@ -248,6 +249,18 @@ struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head)
 	return NULL;
 }
 
+static
+struct rpc_xprt *xprt_switch_find_first_entry_offline(struct list_head *head)
+{
+	struct rpc_xprt *pos;
+
+	list_for_each_entry_rcu(pos, head, xprt_switch) {
+		if (!xprt_is_active(pos))
+			return pos;
+	}
+	return NULL;
+}
+
 static
 struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
 {
@@ -259,8 +272,9 @@ struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
 }
 
 static
-struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
-		const struct rpc_xprt *cur)
+struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head,
+						 const struct rpc_xprt *cur,
+						 bool find_active)
 {
 	struct rpc_xprt *pos;
 	bool found = false;
@@ -268,14 +282,25 @@ struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
 	list_for_each_entry_rcu(pos, head, xprt_switch) {
 		if (cur == pos)
 			found = true;
-		if (found && xprt_is_active(pos))
+		if (found && ((find_active && xprt_is_active(pos)) ||
+			      (!find_active && xprt_is_active(pos))))
 			return pos;
 	}
 	return NULL;
 }
 
 static
-struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
+						const struct rpc_xprt *cur)
+{
+	return _xprt_switch_find_current_entry(head, cur, true);
+}
+
+static
+struct rpc_xprt * _xprt_iter_current_entry(struct rpc_xprt_iter *xpi,
+		struct rpc_xprt *first_entry(struct list_head *head),
+		struct rpc_xprt *current_entry(struct list_head *head,
+					       const struct rpc_xprt *cur))
 {
 	struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
 	struct list_head *head;
@@ -284,8 +309,30 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
 		return NULL;
 	head = &xps->xps_xprt_list;
 	if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2)
-		return xprt_switch_find_first_entry(head);
-	return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
+		return first_entry(head);
+	return current_entry(head, xpi->xpi_cursor);
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+{
+	return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry,
+			xprt_switch_find_current_entry);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_current_entry_offline(struct list_head *head,
+		const struct rpc_xprt *cur)
+{
+	return _xprt_switch_find_current_entry(head, cur, false);
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi)
+{
+	return _xprt_iter_current_entry(xpi,
+			xprt_switch_find_first_entry_offline,
+			xprt_switch_find_current_entry_offline);
 }
 
 bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
@@ -310,7 +357,7 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
 
 static
 struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
-		const struct rpc_xprt *cur)
+		const struct rpc_xprt *cur, bool check_active)
 {
 	struct rpc_xprt *pos, *prev = NULL;
 	bool found = false;
@@ -318,7 +365,12 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
 	list_for_each_entry_rcu(pos, head, xprt_switch) {
 		if (cur == prev)
 			found = true;
-		if (found && xprt_is_active(pos))
+		/* for request to return active transports return only
+		 * active, for request to return offline transports
+		 * return only offline
+		 */
+		if (found && ((check_active && xprt_is_active(pos)) ||
+			      (!check_active && !xprt_is_active(pos))))
 			return pos;
 		prev = pos;
 	}
@@ -355,7 +407,7 @@ struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head
 {
 	struct rpc_xprt *ret;
 
-	ret = xprt_switch_find_next_entry(head, cur);
+	ret = xprt_switch_find_next_entry(head, cur, true);
 	if (ret != NULL)
 		return ret;
 	return xprt_switch_find_first_entry(head);
@@ -397,7 +449,14 @@ static
 struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps,
 		const struct rpc_xprt *cur)
 {
-	return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur);
+	return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, true);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_next_entry_offline(struct rpc_xprt_switch *xps,
+		const struct rpc_xprt *cur)
+{
+	return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, false);
 }
 
 static
@@ -407,6 +466,13 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
 			xprt_switch_find_next_entry_all);
 }
 
+static
+struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi)
+{
+	return xprt_iter_next_entry_multiple(xpi,
+			xprt_switch_find_next_entry_offline);
+}
+
 /*
  * xprt_iter_rewind - Resets the xprt iterator
  * @xpi: pointer to rpc_xprt_iter
@@ -460,6 +526,12 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi,
 	__xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall);
 }
 
+void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
+		struct rpc_xprt_switch *xps)
+{
+	__xprt_iter_init(xpi, xps, &rpc_xprt_iter_listoffline);
+}
+
 /**
  * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch
  * @xpi: pointer to rpc_xprt_iter
@@ -574,3 +646,10 @@ const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = {
 	.xpi_xprt = xprt_iter_current_entry,
 	.xpi_next = xprt_iter_next_entry_all,
 };
+
+static
+const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline = {
+	.xpi_rewind = xprt_iter_default_rewind,
+	.xpi_xprt = xprt_iter_current_entry_offline,
+	.xpi_next = xprt_iter_next_entry_offline,
+};
-- 
cgit v1.2.3


From 9368fd6c75053630e95a6dbd17c9522e82101276 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Mon, 25 Jul 2022 09:32:25 -0400
Subject: SUNRPC enable back offline transports in trunking discovery

When we are adding a transport to a xprt_switch that's already on
the list but has been marked OFFLINE, then make the state ONLINE
since it's been tested now.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index d14333f4947a..71a3a1dd7e81 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -242,6 +242,7 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 			const struct sockaddr *sap);
+void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt);
 void rpc_cleanup_clids(void);
 
 static inline int rpc_reply_expected(struct rpc_task *task)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 68021b70340d..9dbce3b0d3a2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -3095,8 +3095,22 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
 
+void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	struct rpc_xprt_switch *xps;
+
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	rcu_read_unlock();
+	xprt_set_online_locked(xprt, xps);
+}
+
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 {
+	if (rpc_clnt_xprt_switch_has_addr(clnt,
+		(const struct sockaddr *)&xprt->addr)) {
+		return rpc_clnt_xprt_set_online(clnt, xprt);
+	}
 	rcu_read_lock();
 	rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
 				 xprt);
-- 
cgit v1.2.3


From 497e6464d6adcee64f071b18fc826e63cfd2f0a5 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:26 -0400
Subject: SUNRPC create an rpc function that allows xprt removal from rpc_clnt

Expose a function that allows a removal of xprt from the rpc_clnt.

When called from NFS that's running a trunked transport then don't
decrement the active transport counter.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h          |  1 +
 include/linux/sunrpc/xprtmultipath.h |  2 +-
 net/sunrpc/clnt.c                    | 16 +++++++++++++++-
 net/sunrpc/xprt.c                    |  2 +-
 net/sunrpc/xprtmultipath.c           | 11 ++++++-----
 5 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 71a3a1dd7e81..7a43fd514398 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -240,6 +240,7 @@ const char *rpc_proc_name(const struct rpc_task *task);
 
 void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
+void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 			const struct sockaddr *sap);
 void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt);
diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index 688ca7eb1d01..9fff0768d942 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -55,7 +55,7 @@ extern void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps);
 extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 		struct rpc_xprt *xprt);
 extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
-		struct rpc_xprt *xprt);
+		struct rpc_xprt *xprt, bool offline);
 
 extern void xprt_iter_init(struct rpc_xprt_iter *xpi,
 		struct rpc_xprt_switch *xps);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9dbce3b0d3a2..26f3102500bb 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2144,7 +2144,8 @@ call_connect_status(struct rpc_task *task)
 				xprt_release(task);
 				value = atomic_long_dec_return(&xprt->queuelen);
 				if (value == 0)
-					rpc_xprt_switch_remove_xprt(xps, saved);
+					rpc_xprt_switch_remove_xprt(xps, saved,
+								    true);
 				xprt_put(saved);
 				task->tk_xprt = NULL;
 				task->tk_action = call_start;
@@ -3118,6 +3119,19 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
 
+void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	struct rpc_xprt_switch *xps;
+
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	rpc_xprt_switch_remove_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+				    xprt, 0);
+	xps->xps_nunique_destaddr_xprts--;
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_remove_xprt);
+
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 				   const struct sockaddr *sap)
 {
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8f8e3c952f24..44348c9f4b00 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -2182,5 +2182,5 @@ void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps)
 
 	if (!xprt->sending.qlen && !xprt->pending.qlen &&
 	    !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen))
-		rpc_xprt_switch_remove_xprt(xps, xprt);
+		rpc_xprt_switch_remove_xprt(xps, xprt, true);
 }
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 8def8423fc0a..55da01730311 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -62,11 +62,11 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 }
 
 static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
-		struct rpc_xprt *xprt)
+		struct rpc_xprt *xprt, bool offline)
 {
 	if (unlikely(xprt == NULL))
 		return;
-	if (!test_bit(XPRT_OFFLINE, &xprt->state))
+	if (!test_bit(XPRT_OFFLINE, &xprt->state) && offline)
 		xps->xps_nactive--;
 	xps->xps_nxprts--;
 	if (xps->xps_nxprts == 0)
@@ -79,14 +79,15 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
  * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch
  * @xps: pointer to struct rpc_xprt_switch
  * @xprt: pointer to struct rpc_xprt
+ * @offline: indicates if the xprt that's being removed is in an offline state
  *
  * Removes xprt from the list of struct rpc_xprt in xps.
  */
 void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
-		struct rpc_xprt *xprt)
+		struct rpc_xprt *xprt, bool offline)
 {
 	spin_lock(&xps->xps_lock);
-	xprt_switch_remove_xprt_locked(xps, xprt);
+	xprt_switch_remove_xprt_locked(xps, xprt, offline);
 	spin_unlock(&xps->xps_lock);
 	xprt_put(xprt);
 }
@@ -155,7 +156,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps)
 
 		xprt = list_first_entry(&xps->xps_xprt_list,
 				struct rpc_xprt, xprt_switch);
-		xprt_switch_remove_xprt_locked(xps, xprt);
+		xprt_switch_remove_xprt_locked(xps, xprt, true);
 		spin_unlock(&xps->xps_lock);
 		xprt_put(xprt);
 		spin_lock(&xps->xps_lock);
-- 
cgit v1.2.3


From 273d6aed9e5a1859dda15256f45561315c3d237a Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:29 -0400
Subject: SUNRPC export xprt_iter_rewind function

Make xprt_iter_rewind callable outside of xprtmultipath.c

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtmultipath.h | 2 ++
 net/sunrpc/xprtmultipath.c           | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index 9fff0768d942..c0514c684b2c 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -68,6 +68,8 @@ extern void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi,
 
 extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi);
 
+extern void xprt_iter_rewind(struct rpc_xprt_iter *xpi);
+
 extern struct rpc_xprt_switch *xprt_iter_xchg_switch(
 		struct rpc_xprt_iter *xpi,
 		struct rpc_xprt_switch *newswitch);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 55da01730311..685db598acbe 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -481,7 +481,6 @@ struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi)
  * Resets xpi to ensure that it points to the first entry in the list
  * of transports.
  */
-static
 void xprt_iter_rewind(struct rpc_xprt_iter *xpi)
 {
 	rcu_read_lock();
-- 
cgit v1.2.3


From 92cc04f60ab4ae199eee507e5cd4d5aa6c722e9c Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Mon, 25 Jul 2022 09:32:30 -0400
Subject: SUNRPC create a function that probes only offline transports

For only offline transports, attempt to check connectivity via
a NULL call and, if that succeeds, call a provided session trunking
detection function.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 65 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 7a43fd514398..75eea5ebb179 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -235,6 +235,8 @@ int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
 			struct rpc_xprt *,
 			void *);
 void		rpc_clnt_manage_trunked_xprts(struct rpc_clnt *);
+void		rpc_clnt_probe_trunked_xprts(struct rpc_clnt *,
+			struct rpc_add_xprt_test *);
 
 const char *rpc_proc_name(const struct rpc_task *task);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9c9712274ca8..bbfc47f03480 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -807,6 +807,13 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
 	return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall);
 }
 
+static
+int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt,
+				    struct rpc_xprt_iter *xpi)
+{
+	return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline);
+}
+
 /**
  * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports
  * @clnt: pointer to client
@@ -3018,6 +3025,64 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt,
+				  struct rpc_xprt *xprt,
+				  struct rpc_add_xprt_test *data)
+{
+	struct rpc_xprt_switch *xps;
+	struct rpc_xprt *main_xprt;
+	int status = 0;
+
+	xprt_get(xprt);
+
+	rcu_read_lock();
+	main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+	xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+	status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr,
+				   (struct sockaddr *)&main_xprt->addr);
+	rcu_read_unlock();
+	xprt_put(main_xprt);
+	if (status || !test_bit(XPRT_OFFLINE, &xprt->state))
+		goto out;
+
+	status = rpc_clnt_add_xprt_helper(clnt, xprt, data);
+out:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	return status;
+}
+
+/* rpc_clnt_probe_trunked_xprt -- probe offlined transport for session trunking
+ * @clnt rpc_clnt structure
+ *
+ * For each offlined transport found in the rpc_clnt structure call
+ * the function rpc_xprt_probe_trunked() which will determine if this
+ * transport still belongs to the trunking group.
+ */
+void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *clnt,
+				  struct rpc_add_xprt_test *data)
+{
+	struct rpc_xprt_iter xpi;
+	int ret;
+
+	ret = rpc_clnt_xprt_iter_offline_init(clnt, &xpi);
+	if (ret)
+		return;
+	for (;;) {
+		struct rpc_xprt *xprt = xprt_iter_get_next(&xpi);
+
+		if (!xprt)
+			break;
+		ret = rpc_xprt_probe_trunked(clnt, xprt, data);
+		xprt_put(xprt);
+		if (ret < 0)
+			break;
+		xprt_iter_rewind(&xpi);
+	}
+	xprt_iter_destroy(&xpi);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_probe_trunked_xprts);
+
 static int rpc_xprt_offline(struct rpc_clnt *clnt,
 			    struct rpc_xprt *xprt,
 			    void *data)
-- 
cgit v1.2.3


From 44abdd1646e1fbfb781972c0bffc90b4eb3e87b3 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:51 -0700
Subject: vfio: Pass in starting IOVA to vfio_pin/unpin_pages API

The vfio_pin/unpin_pages() so far accepted arrays of PFNs of user IOVA.
Among all three callers, there was only one caller possibly passing in
a non-contiguous PFN list, which is now ensured to have contiguous PFN
inputs too.

Pass in the starting address with "iova" alone to simplify things, so
callers no longer need to maintain a PFN list or to pin/unpin one page
at a time. This also allows VFIO to use more efficient implementations
of pin/unpin_pages.

For now, also update vfio_iommu_type1 to fit this new parameter too,
while keeping its input intact (being user_iova) since we don't want
to spend too much effort swapping its parameters and local variables
at that level.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Tony Krowiak <akrowiak@linux.ibm.com>
Acked-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-6-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  4 ++--
 drivers/gpu/drm/i915/gvt/kvmgt.c                  | 18 +++++----------
 drivers/s390/cio/vfio_ccw_cp.c                    |  4 ++--
 drivers/s390/crypto/vfio_ap_ops.c                 |  9 ++++----
 drivers/vfio/vfio.c                               | 27 ++++++++++-------------
 drivers/vfio/vfio.h                               |  4 ++--
 drivers/vfio/vfio_iommu_type1.c                   | 15 ++++++-------
 include/linux/vfio.h                              |  5 ++---
 8 files changed, 37 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index b0fdf76b339a..ea32a0f13ddb 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -262,10 +262,10 @@ Translation APIs for Mediated Devices
 The following APIs are provided for translating user pfn to host pfn in a VFIO
 driver::
 
-	int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+	int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 				  int npage, int prot, unsigned long *phys_pfn);
 
-	void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
+	void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova,
 				    int npage);
 
 These functions call back into the back-end IOMMU module by using the pin_pages
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 2fee5695515a..8be75c282611 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -231,14 +231,8 @@ static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
 static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		unsigned long size)
 {
-	int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	int npage;
-
-	for (npage = 0; npage < total_pages; npage++) {
-		unsigned long cur_gfn = gfn + npage;
-
-		vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1);
-	}
+	vfio_unpin_pages(&vgpu->vfio_device, gfn << PAGE_SHIFT,
+			 DIV_ROUND_UP(size, PAGE_SIZE));
 }
 
 /* Pin a normal or compound guest page for dma. */
@@ -255,14 +249,14 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 	 * on stack to hold pfns.
 	 */
 	for (npage = 0; npage < total_pages; npage++) {
-		unsigned long cur_gfn = gfn + npage;
+		dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
 		unsigned long pfn;
 
-		ret = vfio_pin_pages(&vgpu->vfio_device, &cur_gfn, 1,
+		ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
 				     IOMMU_READ | IOMMU_WRITE, &pfn);
 		if (ret != 1) {
-			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
-				     cur_gfn, ret);
+			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
+				     &cur_iova, ret);
 			goto err;
 		}
 
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index 3b94863ad24e..a739262f988d 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -114,7 +114,7 @@ static void pfn_array_unpin(struct pfn_array *pa,
 			continue;
 		}
 
-		vfio_unpin_pages(vdev, first, npage);
+		vfio_unpin_pages(vdev, *first << PAGE_SHIFT, npage);
 		unpinned += npage;
 		npage = 1;
 	}
@@ -146,7 +146,7 @@ static int pfn_array_pin(struct pfn_array *pa, struct vfio_device *vdev)
 			continue;
 		}
 
-		ret = vfio_pin_pages(vdev, first, npage,
+		ret = vfio_pin_pages(vdev, *first << PAGE_SHIFT, npage,
 				     IOMMU_READ | IOMMU_WRITE,
 				     &pa->pa_pfn[pinned]);
 		if (ret < 0) {
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 5781059d3ed2..70f484668ca0 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -124,7 +124,7 @@ static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
 		q->saved_isc = VFIO_AP_ISC_INVALID;
 	}
 	if (q->saved_pfn && !WARN_ON(!q->matrix_mdev)) {
-		vfio_unpin_pages(&q->matrix_mdev->vdev, &q->saved_pfn, 1);
+		vfio_unpin_pages(&q->matrix_mdev->vdev, q->saved_pfn << PAGE_SHIFT, 1);
 		q->saved_pfn = 0;
 	}
 }
@@ -258,7 +258,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 		return status;
 	}
 
-	ret = vfio_pin_pages(&q->matrix_mdev->vdev, &g_pfn, 1,
+	ret = vfio_pin_pages(&q->matrix_mdev->vdev, g_pfn << PAGE_SHIFT, 1,
 			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
 	switch (ret) {
 	case 1:
@@ -301,7 +301,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 		break;
 	case AP_RESPONSE_OTHERWISE_CHANGED:
 		/* We could not modify IRQ setings: clear new configuration */
-		vfio_unpin_pages(&q->matrix_mdev->vdev, &g_pfn, 1);
+		vfio_unpin_pages(&q->matrix_mdev->vdev, g_pfn << PAGE_SHIFT, 1);
 		kvm_s390_gisc_unregister(kvm, isc);
 		break;
 	default:
@@ -1232,9 +1232,8 @@ static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova,
 {
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
-	unsigned long g_pfn = iova >> PAGE_SHIFT;
 
-	vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
+	vfio_unpin_pages(&matrix_mdev->vdev, iova, 1);
 }
 
 /**
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 92b10aafae28..ffd1a492eea9 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1934,17 +1934,17 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
 
 /*
- * Pin a set of guest PFNs and return their associated host PFNs for local
+ * Pin contiguous user pages and return their associated host pages for local
  * domain only.
  * @device [in]  : device
- * @user_pfn [in]: array of user/guest PFNs to be pinned.
- * @npage [in]   : count of elements in user_pfn array.  This count should not
- *		   be greater VFIO_PIN_PAGES_MAX_ENTRIES.
+ * @iova [in]    : starting IOVA of user pages to be pinned.
+ * @npage [in]   : count of pages to be pinned.  This count should not
+ *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  * @prot [in]    : protection flags
  * @phys_pfn[out]: array of host PFNs
  * Return error or number of pages pinned.
  */
-int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 		   int npage, int prot, unsigned long *phys_pfn)
 {
 	struct vfio_container *container;
@@ -1952,8 +1952,7 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	struct vfio_iommu_driver *driver;
 	int ret;
 
-	if (!user_pfn || !phys_pfn || !npage ||
-	    !vfio_assert_device_open(device))
+	if (!phys_pfn || !npage || !vfio_assert_device_open(device))
 		return -EINVAL;
 
 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
@@ -1967,7 +1966,7 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	driver = container->iommu_driver;
 	if (likely(driver && driver->ops->pin_pages))
 		ret = driver->ops->pin_pages(container->iommu_data,
-					     group->iommu_group, user_pfn,
+					     group->iommu_group, iova,
 					     npage, prot, phys_pfn);
 	else
 		ret = -ENOTTY;
@@ -1977,15 +1976,13 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
 EXPORT_SYMBOL(vfio_pin_pages);
 
 /*
- * Unpin set of host PFNs for local domain only.
+ * Unpin contiguous host pages for local domain only.
  * @device [in]  : device
- * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
- *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
- * @npage [in]   : count of elements in user_pfn array.  This count should not
+ * @iova [in]    : starting address of user pages to be unpinned.
+ * @npage [in]   : count of pages to be unpinned.  This count should not
  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  */
-void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		      int npage)
+void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
 {
 	struct vfio_container *container;
 	struct vfio_iommu_driver *driver;
@@ -2000,7 +1997,7 @@ void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
 	container = device->group->container;
 	driver = container->iommu_driver;
 
-	driver->ops->unpin_pages(container->iommu_data, user_pfn, npage);
+	driver->ops->unpin_pages(container->iommu_data, iova, npage);
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 6a8424b407c7..e9767e13f00f 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -50,11 +50,11 @@ struct vfio_iommu_driver_ops {
 					struct iommu_group *group);
 	int		(*pin_pages)(void *iommu_data,
 				     struct iommu_group *group,
-				     unsigned long *user_pfn,
+				     dma_addr_t user_iova,
 				     int npage, int prot,
 				     unsigned long *phys_pfn);
 	void		(*unpin_pages)(void *iommu_data,
-				       unsigned long *user_pfn, int npage);
+				       dma_addr_t user_iova, int npage);
 	void		(*register_device)(void *iommu_data,
 					   struct vfio_device *vdev);
 	void		(*unregister_device)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e49fbe9968ef..e629e059118c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -829,7 +829,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 
 static int vfio_iommu_type1_pin_pages(void *iommu_data,
 				      struct iommu_group *iommu_group,
-				      unsigned long *user_pfn,
+				      dma_addr_t user_iova,
 				      int npage, int prot,
 				      unsigned long *phys_pfn)
 {
@@ -841,7 +841,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	bool do_accounting;
 	dma_addr_t iova;
 
-	if (!iommu || !user_pfn || !phys_pfn)
+	if (!iommu || !phys_pfn)
 		return -EINVAL;
 
 	/* Supported for v2 version only */
@@ -857,7 +857,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 again:
 	if (iommu->vaddr_invalid_count) {
 		for (i = 0; i < npage; i++) {
-			iova = user_pfn[i] << PAGE_SHIFT;
+			iova = user_iova + PAGE_SIZE * i;
 			ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
 			if (ret < 0)
 				goto pin_done;
@@ -882,7 +882,7 @@ again:
 	for (i = 0; i < npage; i++) {
 		struct vfio_pfn *vpfn;
 
-		iova = user_pfn[i] << PAGE_SHIFT;
+		iova = user_iova + PAGE_SIZE * i;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		if (!dma) {
 			ret = -EINVAL;
@@ -939,7 +939,7 @@ pin_unwind:
 	for (j = 0; j < i; j++) {
 		dma_addr_t iova;
 
-		iova = user_pfn[j] << PAGE_SHIFT;
+		iova = user_iova + PAGE_SIZE * j;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		vfio_unpin_page_external(dma, iova, do_accounting);
 		phys_pfn[j] = 0;
@@ -950,7 +950,7 @@ pin_done:
 }
 
 static void vfio_iommu_type1_unpin_pages(void *iommu_data,
-					 unsigned long *user_pfn, int npage)
+					 dma_addr_t user_iova, int npage)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	bool do_accounting;
@@ -964,10 +964,9 @@ static void vfio_iommu_type1_unpin_pages(void *iommu_data,
 
 	do_accounting = list_empty(&iommu->domain_list);
 	for (i = 0; i < npage; i++) {
+		dma_addr_t iova = user_iova + PAGE_SIZE * i;
 		struct vfio_dma *dma;
-		dma_addr_t iova;
 
-		iova = user_pfn[i] << PAGE_SHIFT;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		if (!dma)
 			break;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 9f7d74c24925..9e3b6abcf890 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -161,10 +161,9 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 
 #define VFIO_PIN_PAGES_MAX_ENTRIES	(PAGE_SIZE/sizeof(unsigned long))
 
-int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
+int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 		   int npage, int prot, unsigned long *phys_pfn);
-void vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
-		      int npage);
+void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage);
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
 		void *data, size_t len, bool write);
 
-- 
cgit v1.2.3


From 8561aa4fb7d72011c2352af2b1b9caf588942181 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:54 -0700
Subject: vfio: Rename user_iova of vfio_dma_rw()

Following the updated vfio_pin/unpin_pages(), use the simpler "iova".

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-9-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c  | 6 +++---
 include/linux/vfio.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index ffd1a492eea9..606a20b605ba 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -2012,13 +2012,13 @@ EXPORT_SYMBOL(vfio_unpin_pages);
  * not a real device DMA, it is not necessary to pin the user space memory.
  *
  * @device [in]		: VFIO device
- * @user_iova [in]	: base IOVA of a user space buffer
+ * @iova [in]		: base IOVA of a user space buffer
  * @data [in]		: pointer to kernel buffer
  * @len [in]		: kernel buffer length
  * @write		: indicate read or write
  * Return error code on failure or 0 on success.
  */
-int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
 		size_t len, bool write)
 {
 	struct vfio_container *container;
@@ -2034,7 +2034,7 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data,
 
 	if (likely(driver && driver->ops->dma_rw))
 		ret = driver->ops->dma_rw(container->iommu_data,
-					  user_iova, data, len, write);
+					  iova, data, len, write);
 	else
 		ret = -ENOTTY;
 	return ret;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 9e3b6abcf890..acefd663e63b 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -164,7 +164,7 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 		   int npage, int prot, unsigned long *phys_pfn);
 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage);
-int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova,
 		void *data, size_t len, bool write);
 
 /*
-- 
cgit v1.2.3


From 34a255e67615995f729254307a0581c143e03752 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 22 Jul 2022 19:02:56 -0700
Subject: vfio: Replace phys_pfn with pages for vfio_pin_pages()

Most of the callers of vfio_pin_pages() want "struct page *" and the
low-level mm code to pin pages returns a list of "struct page *" too.
So there's no gain in converting "struct page *" to PFN in between.

Replace the output parameter "phys_pfn" list with a "pages" list, to
simplify callers. This also allows us to replace the vfio_iommu_type1
implementation with a more efficient one.

And drop the pfn_valid check in the gvt code, as there is no need to
do such a check at a page-backed struct page pointer.

For now, also update vfio_iommu_type1 to fit this new parameter too.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Eric Farman <farman@linux.ibm.com>
Tested-by: Terrence Xu <terrence.xu@intel.com>
Tested-by: Eric Farman <farman@linux.ibm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/20220723020256.30081-11-nicolinc@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  2 +-
 drivers/gpu/drm/i915/gvt/kvmgt.c                  | 19 ++++++-------------
 drivers/s390/cio/vfio_ccw_cp.c                    | 19 +++++++++----------
 drivers/s390/crypto/vfio_ap_ops.c                 |  6 +++---
 drivers/vfio/vfio.c                               |  8 ++++----
 drivers/vfio/vfio.h                               |  2 +-
 drivers/vfio/vfio_iommu_type1.c                   | 19 +++++++++++--------
 include/linux/vfio.h                              |  2 +-
 8 files changed, 36 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index ea32a0f13ddb..ba5fefcdae1a 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -263,7 +263,7 @@ The following APIs are provided for translating user pfn to host pfn in a VFIO
 driver::
 
 	int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-				  int npage, int prot, unsigned long *phys_pfn);
+				  int npage, int prot, struct page **pages);
 
 	void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova,
 				    int npage);
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 8be75c282611..e3cd58946477 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -240,7 +240,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		unsigned long size, struct page **page)
 {
 	int total_pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	unsigned long base_pfn = 0;
+	struct page *base_page = NULL;
 	int npage;
 	int ret;
 
@@ -250,26 +250,19 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 	 */
 	for (npage = 0; npage < total_pages; npage++) {
 		dma_addr_t cur_iova = (gfn + npage) << PAGE_SHIFT;
-		unsigned long pfn;
+		struct page *cur_page;
 
 		ret = vfio_pin_pages(&vgpu->vfio_device, cur_iova, 1,
-				     IOMMU_READ | IOMMU_WRITE, &pfn);
+				     IOMMU_READ | IOMMU_WRITE, &cur_page);
 		if (ret != 1) {
 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
 				     &cur_iova, ret);
 			goto err;
 		}
 
-		if (!pfn_valid(pfn)) {
-			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
-			npage++;
-			ret = -EFAULT;
-			goto err;
-		}
-
 		if (npage == 0)
-			base_pfn = pfn;
-		else if (base_pfn + npage != pfn) {
+			base_page = cur_page;
+		else if (base_page + npage != cur_page) {
 			gvt_vgpu_err("The pages are not continuous\n");
 			ret = -EINVAL;
 			npage++;
@@ -277,7 +270,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
 		}
 	}
 
-	*page = pfn_to_page(base_pfn);
+	*page = base_page;
 	return 0;
 err:
 	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index cd4ec4f6d6ff..8963f452f963 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -22,8 +22,8 @@
 struct page_array {
 	/* Array that stores pages need to pin. */
 	dma_addr_t		*pa_iova;
-	/* Array that receives PFNs of the pages pinned. */
-	unsigned long		*pa_pfn;
+	/* Array that receives the pinned pages. */
+	struct page		**pa_page;
 	/* Number of pages pinned from @pa_iova. */
 	int			pa_nr;
 };
@@ -68,19 +68,19 @@ static int page_array_alloc(struct page_array *pa, u64 iova, unsigned int len)
 		return -EINVAL;
 
 	pa->pa_iova = kcalloc(pa->pa_nr,
-			      sizeof(*pa->pa_iova) + sizeof(*pa->pa_pfn),
+			      sizeof(*pa->pa_iova) + sizeof(*pa->pa_page),
 			      GFP_KERNEL);
 	if (unlikely(!pa->pa_iova)) {
 		pa->pa_nr = 0;
 		return -ENOMEM;
 	}
-	pa->pa_pfn = (unsigned long *)&pa->pa_iova[pa->pa_nr];
+	pa->pa_page = (struct page **)&pa->pa_iova[pa->pa_nr];
 
 	pa->pa_iova[0] = iova;
-	pa->pa_pfn[0] = -1ULL;
+	pa->pa_page[0] = NULL;
 	for (i = 1; i < pa->pa_nr; i++) {
 		pa->pa_iova[i] = pa->pa_iova[i - 1] + PAGE_SIZE;
-		pa->pa_pfn[i] = -1ULL;
+		pa->pa_page[i] = NULL;
 	}
 
 	return 0;
@@ -144,7 +144,7 @@ static int page_array_pin(struct page_array *pa, struct vfio_device *vdev)
 
 		ret = vfio_pin_pages(vdev, *first, npage,
 				     IOMMU_READ | IOMMU_WRITE,
-				     &pa->pa_pfn[pinned]);
+				     &pa->pa_page[pinned]);
 		if (ret < 0) {
 			goto err_out;
 		} else if (ret > 0 && ret != npage) {
@@ -195,7 +195,7 @@ static inline void page_array_idal_create_words(struct page_array *pa,
 	 */
 
 	for (i = 0; i < pa->pa_nr; i++)
-		idaws[i] = pa->pa_pfn[i] << PAGE_SHIFT;
+		idaws[i] = page_to_phys(pa->pa_page[i]);
 
 	/* Adjust the first IDAW, since it may not start on a page boundary */
 	idaws[0] += pa->pa_iova[0] & (PAGE_SIZE - 1);
@@ -246,8 +246,7 @@ static long copy_from_iova(struct vfio_device *vdev, void *to, u64 iova,
 
 	l = n;
 	for (i = 0; i < pa.pa_nr; i++) {
-		struct page *page = pfn_to_page(pa.pa_pfn[i]);
-		void *from = kmap_local_page(page);
+		void *from = kmap_local_page(pa.pa_page[i]);
 
 		m = PAGE_SIZE;
 		if (i == 0) {
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index d7c38c82f694..75cd92c291e3 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -234,9 +234,9 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 	struct ap_qirq_ctrl aqic_gisa = {};
 	struct ap_queue_status status = {};
 	struct kvm_s390_gisa *gisa;
+	struct page *h_page;
 	int nisc;
 	struct kvm *kvm;
-	unsigned long h_pfn;
 	phys_addr_t h_nib;
 	dma_addr_t nib;
 	int ret;
@@ -251,7 +251,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 	}
 
 	ret = vfio_pin_pages(&q->matrix_mdev->vdev, nib, 1,
-			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
+			     IOMMU_READ | IOMMU_WRITE, &h_page);
 	switch (ret) {
 	case 1:
 		break;
@@ -267,7 +267,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 	kvm = q->matrix_mdev->kvm;
 	gisa = kvm->arch.gisa_int.origin;
 
-	h_nib = (h_pfn << PAGE_SHIFT) | (nib & ~PAGE_MASK);
+	h_nib = page_to_phys(h_page) | (nib & ~PAGE_MASK);
 	aqic_gisa.gisc = isc;
 
 	nisc = kvm_s390_gisc_register(kvm, isc);
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 606a20b605ba..8e23ca59ceed 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1941,18 +1941,18 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
  * @npage [in]   : count of pages to be pinned.  This count should not
  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
  * @prot [in]    : protection flags
- * @phys_pfn[out]: array of host PFNs
+ * @pages[out]   : array of host pages
  * Return error or number of pages pinned.
  */
 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-		   int npage, int prot, unsigned long *phys_pfn)
+		   int npage, int prot, struct page **pages)
 {
 	struct vfio_container *container;
 	struct vfio_group *group = device->group;
 	struct vfio_iommu_driver *driver;
 	int ret;
 
-	if (!phys_pfn || !npage || !vfio_assert_device_open(device))
+	if (!pages || !npage || !vfio_assert_device_open(device))
 		return -EINVAL;
 
 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
@@ -1967,7 +1967,7 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
 	if (likely(driver && driver->ops->pin_pages))
 		ret = driver->ops->pin_pages(container->iommu_data,
 					     group->iommu_group, iova,
-					     npage, prot, phys_pfn);
+					     npage, prot, pages);
 	else
 		ret = -ENOTTY;
 
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index e9767e13f00f..503bea6c843d 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -52,7 +52,7 @@ struct vfio_iommu_driver_ops {
 				     struct iommu_group *group,
 				     dma_addr_t user_iova,
 				     int npage, int prot,
-				     unsigned long *phys_pfn);
+				     struct page **pages);
 	void		(*unpin_pages)(void *iommu_data,
 				       dma_addr_t user_iova, int npage);
 	void		(*register_device)(void *iommu_data,
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e629e059118c..db516c90a977 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -831,7 +831,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 				      struct iommu_group *iommu_group,
 				      dma_addr_t user_iova,
 				      int npage, int prot,
-				      unsigned long *phys_pfn)
+				      struct page **pages)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_iommu_group *group;
@@ -841,7 +841,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	bool do_accounting;
 	dma_addr_t iova;
 
-	if (!iommu || !phys_pfn)
+	if (!iommu || !pages)
 		return -EINVAL;
 
 	/* Supported for v2 version only */
@@ -880,6 +880,7 @@ again:
 	do_accounting = list_empty(&iommu->domain_list);
 
 	for (i = 0; i < npage; i++) {
+		unsigned long phys_pfn;
 		struct vfio_pfn *vpfn;
 
 		iova = user_iova + PAGE_SIZE * i;
@@ -896,23 +897,25 @@ again:
 
 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
 		if (vpfn) {
-			phys_pfn[i] = vpfn->pfn;
+			pages[i] = pfn_to_page(vpfn->pfn);
 			continue;
 		}
 
 		remote_vaddr = dma->vaddr + (iova - dma->iova);
-		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
+		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
 					     do_accounting);
 		if (ret)
 			goto pin_unwind;
 
-		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
+		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
 		if (ret) {
-			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
+			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
 				vfio_lock_acct(dma, -1, true);
 			goto pin_unwind;
 		}
 
+		pages[i] = pfn_to_page(phys_pfn);
+
 		if (iommu->dirty_page_tracking) {
 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 
@@ -935,14 +938,14 @@ again:
 	goto pin_done;
 
 pin_unwind:
-	phys_pfn[i] = 0;
+	pages[i] = NULL;
 	for (j = 0; j < i; j++) {
 		dma_addr_t iova;
 
 		iova = user_iova + PAGE_SIZE * j;
 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 		vfio_unpin_page_external(dma, iova, do_accounting);
-		phys_pfn[j] = 0;
+		pages[j] = NULL;
 	}
 pin_done:
 	mutex_unlock(&iommu->lock);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index acefd663e63b..e05ddc6fe6a5 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -162,7 +162,7 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
 #define VFIO_PIN_PAGES_MAX_ENTRIES	(PAGE_SIZE/sizeof(unsigned long))
 
 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-		   int npage, int prot, unsigned long *phys_pfn);
+		   int npage, int prot, struct page **pages);
 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage);
 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova,
 		void *data, size_t len, bool write);
-- 
cgit v1.2.3


From e948d32c54fa45cc1601c98956bdbbf5f17a3db2 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 23 Jul 2022 19:57:17 +0200
Subject: video: fbdev: imxfb: Drop platform data support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No source file but the driver itself includes the header containing the
platform data definition. The last user is gone since commit
8485adf17a15 ("ARM: imx: Remove imx device directory").

So we can safely drop platform data support.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/video/fbdev/imxfb.c               | 99 +++++++++++--------------------
 include/linux/platform_data/video-imxfb.h | 12 ----
 2 files changed, 34 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c
index a2f644c97f28..85a5bf5639d9 100644
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -656,7 +656,6 @@ static int imxfb_activate_var(struct fb_var_screeninfo *var, struct fb_info *inf
 
 static int imxfb_init_fbinfo(struct platform_device *pdev)
 {
-	struct imx_fb_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	struct fb_info *info = platform_get_drvdata(pdev);
 	struct imxfb_info *fbi = info->par;
 	struct device_node *np;
@@ -690,25 +689,20 @@ static int imxfb_init_fbinfo(struct platform_device *pdev)
 	info->fbops			= &imxfb_ops;
 	info->flags			= FBINFO_FLAG_DEFAULT |
 					  FBINFO_READS_FAST;
-	if (pdata) {
-		fbi->lscr1			= pdata->lscr1;
-		fbi->dmacr			= pdata->dmacr;
-		fbi->pwmr			= pdata->pwmr;
-	} else {
-		np = pdev->dev.of_node;
-		info->var.grayscale = of_property_read_bool(np,
-						"cmap-greyscale");
-		fbi->cmap_inverse = of_property_read_bool(np, "cmap-inverse");
-		fbi->cmap_static = of_property_read_bool(np, "cmap-static");
 
-		fbi->lscr1 = IMXFB_LSCR1_DEFAULT;
+	np = pdev->dev.of_node;
+	info->var.grayscale = of_property_read_bool(np,
+					"cmap-greyscale");
+	fbi->cmap_inverse = of_property_read_bool(np, "cmap-inverse");
+	fbi->cmap_static = of_property_read_bool(np, "cmap-static");
 
-		of_property_read_u32(np, "fsl,lpccr", &fbi->pwmr);
+	fbi->lscr1 = IMXFB_LSCR1_DEFAULT;
 
-		of_property_read_u32(np, "fsl,lscr1", &fbi->lscr1);
+	of_property_read_u32(np, "fsl,lpccr", &fbi->pwmr);
 
-		of_property_read_u32(np, "fsl,dmacr", &fbi->dmacr);
-	}
+	of_property_read_u32(np, "fsl,lscr1", &fbi->lscr1);
+
+	of_property_read_u32(np, "fsl,dmacr", &fbi->dmacr);
 
 	return 0;
 }
@@ -863,10 +857,10 @@ static int imxfb_probe(struct platform_device *pdev)
 	struct imxfb_info *fbi;
 	struct lcd_device *lcd;
 	struct fb_info *info;
-	struct imx_fb_platform_data *pdata;
 	struct resource *res;
 	struct imx_fb_videomode *m;
 	const struct of_device_id *of_id;
+	struct device_node *display_np;
 	int ret, i;
 	int bytes_per_pixel;
 
@@ -884,8 +878,6 @@ static int imxfb_probe(struct platform_device *pdev)
 	if (!res)
 		return -ENODEV;
 
-	pdata = dev_get_platdata(&pdev->dev);
-
 	info = framebuffer_alloc(sizeof(struct imxfb_info), &pdev->dev);
 	if (!info)
 		return -ENOMEM;
@@ -898,43 +890,34 @@ static int imxfb_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto failed_init;
 
-	if (pdata) {
-		if (!fb_mode)
-			fb_mode = pdata->mode[0].mode.name;
-
-		fbi->mode = pdata->mode;
-		fbi->num_modes = pdata->num_modes;
-	} else {
-		struct device_node *display_np;
-		fb_mode = NULL;
-
-		display_np = of_parse_phandle(pdev->dev.of_node, "display", 0);
-		if (!display_np) {
-			dev_err(&pdev->dev, "No display defined in devicetree\n");
-			ret = -EINVAL;
-			goto failed_of_parse;
-		}
+	fb_mode = NULL;
 
-		/*
-		 * imxfb does not support more modes, we choose only the native
-		 * mode.
-		 */
-		fbi->num_modes = 1;
-
-		fbi->mode = devm_kzalloc(&pdev->dev,
-				sizeof(struct imx_fb_videomode), GFP_KERNEL);
-		if (!fbi->mode) {
-			ret = -ENOMEM;
-			of_node_put(display_np);
-			goto failed_of_parse;
-		}
+	display_np = of_parse_phandle(pdev->dev.of_node, "display", 0);
+	if (!display_np) {
+		dev_err(&pdev->dev, "No display defined in devicetree\n");
+		ret = -EINVAL;
+		goto failed_of_parse;
+	}
 
-		ret = imxfb_of_read_mode(&pdev->dev, display_np, fbi->mode);
+	/*
+	 * imxfb does not support more modes, we choose only the native
+	 * mode.
+	 */
+	fbi->num_modes = 1;
+
+	fbi->mode = devm_kzalloc(&pdev->dev,
+			sizeof(struct imx_fb_videomode), GFP_KERNEL);
+	if (!fbi->mode) {
+		ret = -ENOMEM;
 		of_node_put(display_np);
-		if (ret)
-			goto failed_of_parse;
+		goto failed_of_parse;
 	}
 
+	ret = imxfb_of_read_mode(&pdev->dev, display_np, fbi->mode);
+	of_node_put(display_np);
+	if (ret)
+		goto failed_of_parse;
+
 	/* Calculate maximum bytes used per pixel. In most cases this should
 	 * be the same as m->bpp/8 */
 	m = &fbi->mode[0];
@@ -1001,13 +984,6 @@ static int imxfb_probe(struct platform_device *pdev)
 
 	info->fix.smem_start = fbi->map_dma;
 
-	if (pdata && pdata->init) {
-		ret = pdata->init(fbi->pdev);
-		if (ret)
-			goto failed_platform_init;
-	}
-
-
 	INIT_LIST_HEAD(&info->modelist);
 	for (i = 0; i < fbi->num_modes; i++)
 		fb_add_videomode(&fbi->mode[i].mode, &info->modelist);
@@ -1059,9 +1035,6 @@ failed_lcd:
 failed_register:
 	fb_dealloc_cmap(&info->cmap);
 failed_cmap:
-	if (pdata && pdata->exit)
-		pdata->exit(fbi->pdev);
-failed_platform_init:
 	dma_free_wc(&pdev->dev, fbi->map_size, info->screen_buffer,
 		    fbi->map_dma);
 failed_map:
@@ -1079,7 +1052,6 @@ failed_init:
 
 static int imxfb_remove(struct platform_device *pdev)
 {
-	struct imx_fb_platform_data *pdata;
 	struct fb_info *info = platform_get_drvdata(pdev);
 	struct imxfb_info *fbi = info->par;
 	struct resource *res;
@@ -1092,9 +1064,6 @@ static int imxfb_remove(struct platform_device *pdev)
 
 	unregister_framebuffer(info);
 	fb_dealloc_cmap(&info->cmap);
-	pdata = dev_get_platdata(&pdev->dev);
-	if (pdata && pdata->exit)
-		pdata->exit(fbi->pdev);
 	dma_free_wc(&pdev->dev, fbi->map_size, info->screen_buffer,
 		    fbi->map_dma);
 	iounmap(fbi->regs);
diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
index 02812651af7d..b80a156a6617 100644
--- a/include/linux/platform_data/video-imxfb.h
+++ b/include/linux/platform_data/video-imxfb.h
@@ -55,16 +55,4 @@ struct imx_fb_videomode {
 	unsigned char	bpp;
 };
 
-struct imx_fb_platform_data {
-	struct imx_fb_videomode *mode;
-	int		num_modes;
-
-	u_int		pwmr;
-	u_int		lscr1;
-	u_int		dmacr;
-
-	int (*init)(struct platform_device *);
-	void (*exit)(struct platform_device *);
-};
-
 #endif /* ifndef __MACH_IMXFB_H__ */
-- 
cgit v1.2.3


From e2279cc92919f37b4af985cb28ae350bb3e62e71 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 23 Jul 2022 19:57:18 +0200
Subject: video: fbdev: imxfb: Drop unused symbols from header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only file that includes <linux/platform_data/video-imxfb.h> is the
imxfb driver. Drop all symbols that are unused there.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 include/linux/platform_data/video-imxfb.h | 35 -------------------------------
 1 file changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
index b80a156a6617..a16837c5e43c 100644
--- a/include/linux/platform_data/video-imxfb.h
+++ b/include/linux/platform_data/video-imxfb.h
@@ -8,45 +8,10 @@
 #include <linux/fb.h>
 
 #define PCR_TFT		(1 << 31)
-#define PCR_COLOR	(1 << 30)
-#define PCR_PBSIZ_1	(0 << 28)
-#define PCR_PBSIZ_2	(1 << 28)
-#define PCR_PBSIZ_4	(2 << 28)
-#define PCR_PBSIZ_8	(3 << 28)
-#define PCR_BPIX_1	(0 << 25)
-#define PCR_BPIX_2	(1 << 25)
-#define PCR_BPIX_4	(2 << 25)
 #define PCR_BPIX_8	(3 << 25)
 #define PCR_BPIX_12	(4 << 25)
 #define PCR_BPIX_16	(5 << 25)
 #define PCR_BPIX_18	(6 << 25)
-#define PCR_PIXPOL	(1 << 24)
-#define PCR_FLMPOL	(1 << 23)
-#define PCR_LPPOL	(1 << 22)
-#define PCR_CLKPOL	(1 << 21)
-#define PCR_OEPOL	(1 << 20)
-#define PCR_SCLKIDLE	(1 << 19)
-#define PCR_END_SEL	(1 << 18)
-#define PCR_END_BYTE_SWAP (1 << 17)
-#define PCR_REV_VS	(1 << 16)
-#define PCR_ACD_SEL	(1 << 15)
-#define PCR_ACD(x)	(((x) & 0x7f) << 8)
-#define PCR_SCLK_SEL	(1 << 7)
-#define PCR_SHARP	(1 << 6)
-#define PCR_PCD(x)	((x) & 0x3f)
-
-#define PWMR_CLS(x)	(((x) & 0x1ff) << 16)
-#define PWMR_LDMSK	(1 << 15)
-#define PWMR_SCR1	(1 << 10)
-#define PWMR_SCR0	(1 << 9)
-#define PWMR_CC_EN	(1 << 8)
-#define PWMR_PW(x)	((x) & 0xff)
-
-#define LSCR1_PS_RISE_DELAY(x)    (((x) & 0x7f) << 26)
-#define LSCR1_CLS_RISE_DELAY(x)   (((x) & 0x3f) << 16)
-#define LSCR1_REV_TOGGLE_DELAY(x) (((x) & 0xf) << 8)
-#define LSCR1_GRAY2(x)            (((x) & 0xf) << 4)
-#define LSCR1_GRAY1(x)            (((x) & 0xf))
 
 struct imx_fb_videomode {
 	struct fb_videomode mode;
-- 
cgit v1.2.3


From ded77a74ee6bc3dea72ad41129823a812e4b64f3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 23 Jul 2022 19:57:19 +0200
Subject: video: fbdev: imxfb: Fold <linux/platform_data/video-imxfb.h> into
 only user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No source file but the driver itself includes the header containing the
platform data definition. The last user is gone since commit
8485adf17a15 ("ARM: imx: Remove imx device directory").

Move the remaining symbols directly into the driver and remove the then
unused header file.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 MAINTAINERS                               |  1 -
 drivers/video/fbdev/imxfb.c               | 13 ++++++++++++-
 include/linux/platform_data/video-imxfb.h | 23 -----------------------
 3 files changed, 12 insertions(+), 25 deletions(-)
 delete mode 100644 include/linux/platform_data/video-imxfb.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 651616ed8ae2..75c4d7ebef09 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8011,7 +8011,6 @@ L:	linux-fbdev@vger.kernel.org
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	drivers/video/fbdev/imxfb.c
-F:	include/linux/platform_data/video-imxfb.h
 
 FREESCALE IMX DDR PMU DRIVER
 M:	Frank Li <Frank.li@nxp.com>
diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c
index 85a5bf5639d9..fa6a19c1ae9b 100644
--- a/drivers/video/fbdev/imxfb.c
+++ b/drivers/video/fbdev/imxfb.c
@@ -41,7 +41,18 @@
 #include <video/of_videomode.h>
 #include <video/videomode.h>
 
-#include <linux/platform_data/video-imxfb.h>
+#define PCR_TFT		(1 << 31)
+#define PCR_BPIX_8	(3 << 25)
+#define PCR_BPIX_12	(4 << 25)
+#define PCR_BPIX_16	(5 << 25)
+#define PCR_BPIX_18	(6 << 25)
+
+struct imx_fb_videomode {
+	struct fb_videomode mode;
+	u32 pcr;
+	bool aus_mode;
+	unsigned char	bpp;
+};
 
 /*
  * Complain if VAR is out of range.
diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h
deleted file mode 100644
index a16837c5e43c..000000000000
--- a/include/linux/platform_data/video-imxfb.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This structure describes the machine which we are running on.
- */
-#ifndef __MACH_IMXFB_H__
-#define __MACH_IMXFB_H__
-
-#include <linux/fb.h>
-
-#define PCR_TFT		(1 << 31)
-#define PCR_BPIX_8	(3 << 25)
-#define PCR_BPIX_12	(4 << 25)
-#define PCR_BPIX_16	(5 << 25)
-#define PCR_BPIX_18	(6 << 25)
-
-struct imx_fb_videomode {
-	struct fb_videomode mode;
-	u32 pcr;
-	bool aus_mode;
-	unsigned char	bpp;
-};
-
-#endif /* ifndef __MACH_IMXFB_H__ */
-- 
cgit v1.2.3


From 42399301203e3cddef36cde457228f9247618313 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:50:52 -0600
Subject: lib/scatterlist: add flag for indicating P2PDMA segments in an SGL

Introduce a dma_flags field in struct scatterlist. These flags will be
used by dma_[un]map_sg_p2pdma() to determine when a given SGL segments
dma_address points to a PCI bus address. dma_unmap_sg_p2pdma() will need
to perform different cleanup when a segment is marked as a bus address.

The dma_flags field will fit in the existing padding on 64BIT systems
(assuming CONFIG_NEED_SG_DMA_LENGTH is also set).

The new bit will only be used when CONFIG_PCI_P2PDMA is set; this means
PCI P2PDMA will require CONFIG_64BIT. This should be acceptable as the
majority of P2PDMA use cases are restricted to newer root complexes and
roughly require the extra address space for memory BARs used in the
transactions.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/Kconfig         |  5 ++++
 include/linux/scatterlist.h | 69 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 133c73207782..5cc7cba1941f 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -164,6 +164,11 @@ config PCI_PASID
 config PCI_P2PDMA
 	bool "PCI peer-to-peer transfer support"
 	depends on ZONE_DEVICE
+	#
+	# The need for the scatterlist DMA bus address flag means PCI P2PDMA
+	# requires 64bit
+	#
+	depends on 64BIT
 	select GENERIC_ALLOCATOR
 	help
 	  Enableѕ drivers to do PCI peer-to-peer transactions to and from
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 7ff9d6386c12..375a5e90d86a 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -16,6 +16,9 @@ struct scatterlist {
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 	unsigned int	dma_length;
 #endif
+#ifdef CONFIG_PCI_P2PDMA
+	unsigned int    dma_flags;
+#endif
 };
 
 /*
@@ -245,6 +248,72 @@ static inline void sg_unmark_end(struct scatterlist *sg)
 	sg->page_link &= ~SG_END;
 }
 
+/*
+ * CONFGI_PCI_P2PDMA depends on CONFIG_64BIT which means there is 4 bytes
+ * in struct scatterlist (assuming also CONFIG_NEED_SG_DMA_LENGTH is set).
+ * Use this padding for DMA flags bits to indicate when a specific
+ * dma address is a bus address.
+ */
+#ifdef CONFIG_PCI_P2PDMA
+
+#define SG_DMA_BUS_ADDRESS (1 << 0)
+
+/**
+ * sg_dma_is_bus address - Return whether a given segment was marked
+ *			   as a bus address
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Returns true if sg_dma_mark_bus_address() has been called on
+ *   this segment.
+ **/
+static inline bool sg_is_dma_bus_address(struct scatterlist *sg)
+{
+	return sg->dma_flags & SG_DMA_BUS_ADDRESS;
+}
+
+/**
+ * sg_dma_mark_bus address - Mark the scatterlist entry as a bus address
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Marks the passed in sg entry to indicate that the dma_address is
+ *   a bus address and doesn't need to be unmapped. This should only be
+ *   used by dma_map_sg() implementations to mark bus addresses
+ *   so they can be properly cleaned up in dma_unmap_sg().
+ **/
+static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
+{
+	sg->dma_flags |= SG_DMA_BUS_ADDRESS;
+}
+
+/**
+ * sg_unmark_bus_address - Unmark the scatterlist entry as a bus address
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Clears the bus address mark.
+ **/
+static inline void sg_dma_unmark_bus_address(struct scatterlist *sg)
+{
+	sg->dma_flags &= ~SG_DMA_BUS_ADDRESS;
+}
+
+#else
+
+static inline bool sg_is_dma_bus_address(struct scatterlist *sg)
+{
+	return false;
+}
+static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
+{
+}
+static inline void sg_dma_unmark_bus_address(struct scatterlist *sg)
+{
+}
+
+#endif
+
 /**
  * sg_phys - Return physical address of an sg entry
  * @sg:	     SG entry
-- 
cgit v1.2.3


From 5e180ff326b43bd3fb72892e1083b2707159aa6e Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:50:54 -0600
Subject: PCI/P2PDMA: Introduce helpers for dma_map_sg implementations

Add pci_p2pdma_map_segment() as a helper for dma_map_sg()
implementations. It takes an scatterlist segment that must point to a
pci_p2pdma struct page and will map it if the mapping requires a bus
address.

The return value indicates whether the mapping required a bus address
or whether the caller still needs to map the segment normally. If the
segment should not be mapped, -EREMOTEIO is returned.

This helper uses a state structure to track the changes to the
pgmap across calls and avoid needing to lookup into the xarray for
every page.

The prototype for the helper is added to dma-map-ops.h as it is only
useful to dma map implementations and don't need to pollute the public
pci-p2pdma header.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/p2pdma.c        | 44 +++++++++++++++++++++++++++++++------
 include/linux/dma-map-ops.h | 53 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4e8bc457e29a..5d2538aa0778 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -10,6 +10,7 @@
 
 #define pr_fmt(fmt) "pci-p2pdma: " fmt
 #include <linux/ctype.h>
+#include <linux/dma-map-ops.h>
 #include <linux/pci-p2pdma.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -20,13 +21,6 @@
 #include <linux/seq_buf.h>
 #include <linux/xarray.h>
 
-enum pci_p2pdma_map_type {
-	PCI_P2PDMA_MAP_UNKNOWN = 0,
-	PCI_P2PDMA_MAP_NOT_SUPPORTED,
-	PCI_P2PDMA_MAP_BUS_ADDR,
-	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
 struct pci_p2pdma {
 	struct gen_pool *pool;
 	bool p2pmem_published;
@@ -944,6 +938,42 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
 
+/**
+ * pci_p2pdma_map_segment - map an sg segment determining the mapping type
+ * @state: State structure that should be declared outside of the for_each_sg()
+ *	loop and initialized to zero.
+ * @dev: DMA device that's doing the mapping operation
+ * @sg: scatterlist segment to map
+ *
+ * This is a helper to be used by non-IOMMU dma_map_sg() implementations where
+ * the sg segment is the same for the page_link and the dma_address.
+ *
+ * Attempt to map a single segment in an SGL with the PCI bus address.
+ * The segment must point to a PCI P2PDMA page and thus must be
+ * wrapped in a is_pci_p2pdma_page(sg_page(sg)) check.
+ *
+ * Returns the type of mapping used and maps the page if the type is
+ * PCI_P2PDMA_MAP_BUS_ADDR.
+ */
+enum pci_p2pdma_map_type
+pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev,
+		       struct scatterlist *sg)
+{
+	if (state->pgmap != sg_page(sg)->pgmap) {
+		state->pgmap = sg_page(sg)->pgmap;
+		state->map = pci_p2pdma_map_type(state->pgmap, dev);
+		state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset;
+	}
+
+	if (state->map == PCI_P2PDMA_MAP_BUS_ADDR) {
+		sg->dma_address = sg_phys(sg) + state->bus_off;
+		sg_dma_len(sg) = sg->length;
+		sg_dma_mark_bus_address(sg);
+	}
+
+	return state->map;
+}
+
 /**
  * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
  *		to enable p2pdma
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 98ceba6fa848..99cec59dbfcb 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -380,4 +380,57 @@ static inline void debug_dma_dump_mappings(struct device *dev)
 
 extern const struct dma_map_ops dma_dummy_ops;
 
+enum pci_p2pdma_map_type {
+	/*
+	 * PCI_P2PDMA_MAP_UNKNOWN: Used internally for indicating the mapping
+	 * type hasn't been calculated yet. Functions that return this enum
+	 * never return this value.
+	 */
+	PCI_P2PDMA_MAP_UNKNOWN = 0,
+
+	/*
+	 * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
+	 * traverse the host bridge and the host bridge is not in the
+	 * allowlist. DMA Mapping routines should return an error when
+	 * this is returned.
+	 */
+	PCI_P2PDMA_MAP_NOT_SUPPORTED,
+
+	/*
+	 * PCI_P2PDMA_BUS_ADDR: Indicates that two devices can talk to
+	 * each other directly through a PCI switch and the transaction will
+	 * not traverse the host bridge. Such a mapping should program
+	 * the DMA engine with PCI bus addresses.
+	 */
+	PCI_P2PDMA_MAP_BUS_ADDR,
+
+	/*
+	 * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
+	 * to each other, but the transaction traverses a host bridge on the
+	 * allowlist. In this case, a normal mapping either with CPU physical
+	 * addresses (in the case of dma-direct) or IOVA addresses (in the
+	 * case of IOMMUs) should be used to program the DMA engine.
+	 */
+	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
+struct pci_p2pdma_map_state {
+	struct dev_pagemap *pgmap;
+	int map;
+	u64 bus_off;
+};
+
+#ifdef CONFIG_PCI_P2PDMA
+enum pci_p2pdma_map_type
+pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev,
+		       struct scatterlist *sg);
+#else /* CONFIG_PCI_P2PDMA */
+static inline enum pci_p2pdma_map_type
+pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device *dev,
+		       struct scatterlist *sg)
+{
+	return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
+#endif /* CONFIG_PCI_P2PDMA */
+
 #endif /* _LINUX_DMA_MAP_OPS_H */
-- 
cgit v1.2.3


From 159bf19270e80b5bc4b13aa88072dcb390b4d297 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:50:57 -0600
Subject: dma-mapping: add flags to dma_map_ops to indicate PCI P2PDMA support

Add a flags member to the dma_map_ops structure with one flag to
indicate support for PCI P2PDMA.

Also, add a helper to check if a device supports PCI P2PDMA.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-map-ops.h | 10 ++++++++++
 include/linux/dma-mapping.h |  5 +++++
 kernel/dma/mapping.c        | 18 ++++++++++++++++++
 3 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 99cec59dbfcb..010df04358aa 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -11,7 +11,17 @@
 
 struct cma;
 
+/*
+ * Values for struct dma_map_ops.flags:
+ *
+ * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can
+ * handle PCI P2PDMA pages in the map_sg/unmap_sg operation.
+ */
+#define DMA_F_PCI_P2PDMA_SUPPORTED     (1 << 0)
+
 struct dma_map_ops {
+	unsigned int flags;
+
 	void *(*alloc)(struct device *dev, size_t size,
 			dma_addr_t *dma_handle, gfp_t gfp,
 			unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index fe3849434b2a..25a30906289d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -140,6 +140,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 		unsigned long attrs);
 bool dma_can_mmap(struct device *dev);
 int dma_supported(struct device *dev, u64 mask);
+bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
@@ -251,6 +252,10 @@ static inline int dma_supported(struct device *dev, u64 mask)
 {
 	return 0;
 }
+static inline bool dma_pci_p2pdma_supported(struct device *dev)
+{
+	return false;
+}
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
 	return -EIO;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 746d46825d08..a9ce5d728231 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -723,6 +723,24 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	/* if ops is not set, dma direct will be used which supports P2PDMA */
+	if (!ops)
+		return true;
+
+	/*
+	 * Note: dma_ops_bypass is not checked here because P2PDMA should
+	 * not be used with dma mapping ops that do not have support even
+	 * if the specific device is bypassing them.
+	 */
+
+	return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
+
 #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
 void arch_dma_set_mask(struct device *dev, u64 mask);
 #else
-- 
cgit v1.2.3


From 0d06132fc84bd91379300768c75a19474e06d0c5 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Fri, 8 Jul 2022 10:51:04 -0600
Subject: PCI/P2PDMA: Remove pci_p2pdma_[un]map_sg()

This interface is superseded by support in dma_map_sg() which now supports
heterogeneous scatterlists. There are no longer any users, so remove it.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/pci/p2pdma.c       | 66 ----------------------------------------------
 include/linux/pci-p2pdma.h | 27 -------------------
 2 files changed, 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 5d2538aa0778..4496a7c5c478 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -872,72 +872,6 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
 	return type;
 }
 
-static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
-		struct device *dev, struct scatterlist *sg, int nents)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nents, i) {
-		s->dma_address = sg_phys(s) + p2p_pgmap->bus_offset;
-		sg_dma_len(s) = s->length;
-	}
-
-	return nents;
-}
-
-/**
- * pci_p2pdma_map_sg_attrs - map a PCI peer-to-peer scatterlist for DMA
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: elements in the scatterlist
- * @dir: DMA direction
- * @attrs: DMA attributes passed to dma_map_sg() (if called)
- *
- * Scatterlists mapped with this function should be unmapped using
- * pci_p2pdma_unmap_sg_attrs().
- *
- * Returns the number of SG entries mapped or 0 on error.
- */
-int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	struct pci_p2pdma_pagemap *p2p_pgmap =
-		to_p2p_pgmap(sg_page(sg)->pgmap);
-
-	switch (pci_p2pdma_map_type(sg_page(sg)->pgmap, dev)) {
-	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-		return dma_map_sg_attrs(dev, sg, nents, dir, attrs);
-	case PCI_P2PDMA_MAP_BUS_ADDR:
-		return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
-	default:
-		/* Mapping is not Supported */
-		return 0;
-	}
-}
-EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs);
-
-/**
- * pci_p2pdma_unmap_sg_attrs - unmap a PCI peer-to-peer scatterlist that was
- *	mapped with pci_p2pdma_map_sg()
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: number of elements returned by pci_p2pdma_map_sg()
- * @dir: DMA direction
- * @attrs: DMA attributes passed to dma_unmap_sg() (if called)
- */
-void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-	enum pci_p2pdma_map_type map_type;
-
-	map_type = pci_p2pdma_map_type(sg_page(sg)->pgmap, dev);
-
-	if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
-		dma_unmap_sg_attrs(dev, sg, nents, dir, attrs);
-}
-EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
-
 /**
  * pci_p2pdma_map_segment - map an sg segment determining the mapping type
  * @state: State structure that should be declared outside of the for_each_sg()
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 8318a97c9c61..2c07aa6b7665 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -30,10 +30,6 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
 					 unsigned int *nents, u32 length);
 void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl);
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
-int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
-void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
 int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
 			    bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
@@ -83,17 +79,6 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
-static inline int pci_p2pdma_map_sg_attrs(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-	return 0;
-}
-static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir,
-		unsigned long attrs)
-{
-}
 static inline int pci_p2pdma_enable_store(const char *page,
 		struct pci_dev **p2p_dev, bool *use_p2pdma)
 {
@@ -119,16 +104,4 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client)
 	return pci_p2pmem_find_many(&client, 1);
 }
 
-static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg,
-				    int nents, enum dma_data_direction dir)
-{
-	return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0);
-}
-
-static inline void pci_p2pdma_unmap_sg(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir)
-{
-	pci_p2pdma_unmap_sg_attrs(dev, sg, nents, dir, 0);
-}
-
 #endif /* _LINUX_PCI_P2P_H */
-- 
cgit v1.2.3


From 019e442bb0d5e7f25ada8eb4252e22e62d17a95e Mon Sep 17 00:00:00 2001
From: Axe Yang <axe.yang@mediatek.com>
Date: Tue, 26 Jul 2022 14:28:41 +0800
Subject: mmc: core: Add support for SDIO wakeup interrupt

If wakeup-source flag is set in host dts node, parse EAI information
from SDIO CCCR interrupt externsion segment for in-band wakeup. If
async interrupt is supported by SDIO card then enable it and set
enable_async_irq flag in sdio_cccr structure to 1. The parse flow is
implemented in sdio_read_cccr().

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Axe Yang <axe.yang@mediatek.com>
Link: https://lore.kernel.org/r/20220726062842.18846-3-axe.yang@mediatek.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sdio.c  | 14 ++++++++++++++
 include/linux/mmc/card.h |  8 +++++++-
 include/linux/mmc/sdio.h |  5 +++++
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index b589df1c35e0..0b682a31cd3e 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -226,6 +226,20 @@ static int sdio_read_cccr(struct mmc_card *card, u32 ocr)
 				card->sw_caps.sd3_drv_type |= SD_DRIVER_TYPE_C;
 			if (data & SDIO_DRIVE_SDTD)
 				card->sw_caps.sd3_drv_type |= SD_DRIVER_TYPE_D;
+
+			ret = mmc_io_rw_direct(card, 0, 0, SDIO_CCCR_INTERRUPT_EXT, 0, &data);
+			if (ret)
+				goto out;
+
+			if (data & SDIO_INTERRUPT_EXT_SAI) {
+				data |= SDIO_INTERRUPT_EXT_EAI;
+				ret = mmc_io_rw_direct(card, 1, 0, SDIO_CCCR_INTERRUPT_EXT,
+						       data, NULL);
+				if (ret)
+					goto out;
+
+				card->cccr.enable_async_irq = 1;
+			}
 		}
 
 		/* if no uhs mode ensure we check for high speed */
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 156a7b673a28..8a30de08e913 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -219,7 +219,8 @@ struct sdio_cccr {
 				wide_bus:1,
 				high_power:1,
 				high_speed:1,
-				disable_cd:1;
+				disable_cd:1,
+				enable_async_irq:1;
 };
 
 struct sdio_cis {
@@ -343,6 +344,11 @@ static inline bool mmc_large_sector(struct mmc_card *card)
 	return card->ext_csd.data_sector_size == 4096;
 }
 
+static inline int mmc_card_enable_async_irq(struct mmc_card *card)
+{
+	return card->cccr.enable_async_irq;
+}
+
 bool mmc_card_is_blockaddr(struct mmc_card *card);
 
 #define mmc_card_mmc(c)		((c)->type == MMC_TYPE_MMC)
diff --git a/include/linux/mmc/sdio.h b/include/linux/mmc/sdio.h
index 2a05d1ac4f0e..1ef400f28642 100644
--- a/include/linux/mmc/sdio.h
+++ b/include/linux/mmc/sdio.h
@@ -159,6 +159,11 @@
 #define  SDIO_DTSx_SET_TYPE_A	(1 << SDIO_DRIVE_DTSx_SHIFT)
 #define  SDIO_DTSx_SET_TYPE_C	(2 << SDIO_DRIVE_DTSx_SHIFT)
 #define  SDIO_DTSx_SET_TYPE_D	(3 << SDIO_DRIVE_DTSx_SHIFT)
+
+#define SDIO_CCCR_INTERRUPT_EXT	0x16
+#define SDIO_INTERRUPT_EXT_SAI	(1 << 0)
+#define SDIO_INTERRUPT_EXT_EAI	(1 << 1)
+
 /*
  * Function Basic Registers (FBR)
  */
-- 
cgit v1.2.3


From 04ad63f086d1a9649b8b082748cbc7a570ade461 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 11 Jan 2022 08:06:40 -0800
Subject: cxl/region: Introduce cxl_pmem_region objects

The LIBNVDIMM subsystem is a platform agnostic representation of system
NVDIMM / persistent memory resources. To date, the CXL subsystem's
interaction with LIBNVDIMM has been to register an nvdimm-bridge device
and cxl_nvdimm objects to proxy CXL capabilities into existing LIBNVDIMM
subsystem mechanics.

With regions the approach is the same. Create a new cxl_pmem_region
object to proxy CXL region details into a LIBNVDIMM definition. With
this enabling LIBNVDIMM can partition CXL persistent memory regions with
legacy namespace labels. A follow-on patch will add CXL region label and
CXL namespace label support to persist region configurations across
driver reload / system-reset events.

Co-developed-by: Ben Widawsky <bwidawsk@kernel.org>
Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/165784340111.1758207.3036498385188290968.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/core.h      |   3 +
 drivers/cxl/core/pmem.c      |   4 +-
 drivers/cxl/core/port.c      |   2 +
 drivers/cxl/core/region.c    | 142 +++++++++++++++++++++++++-
 drivers/cxl/cxl.h            |  36 ++++++-
 drivers/cxl/pmem.c           | 238 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/region_devs.c |  28 +++--
 include/linux/libnvdimm.h    |   5 +
 8 files changed, 446 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 391aadf9e7fa..1d8f87be283f 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -13,11 +13,13 @@ extern struct attribute_group cxl_base_attribute_group;
 extern struct device_attribute dev_attr_create_pmem_region;
 extern struct device_attribute dev_attr_delete_region;
 extern struct device_attribute dev_attr_region;
+extern const struct device_type cxl_pmem_region_type;
 extern const struct device_type cxl_region_type;
 void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled);
 #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
 #define CXL_REGION_TYPE(x) (&cxl_region_type)
 #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
+#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type)
 int cxl_region_init(void);
 void cxl_region_exit(void);
 #else
@@ -34,6 +36,7 @@ static inline void cxl_region_exit(void)
 #define CXL_REGION_ATTR(x) NULL
 #define CXL_REGION_TYPE(x) NULL
 #define SET_CXL_REGION_ATTR(x)
+#define CXL_PMEM_REGION_TYPE(x) NULL
 #endif
 
 struct cxl_send_command;
diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index bec7cfb54ebf..1d12a8206444 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -62,9 +62,9 @@ static int match_nvdimm_bridge(struct device *dev, void *data)
 	return is_cxl_nvdimm_bridge(dev);
 }
 
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd)
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *start)
 {
-	struct cxl_port *port = find_cxl_root(&cxl_nvd->dev);
+	struct cxl_port *port = find_cxl_root(start);
 	struct device *dev;
 
 	if (!port)
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index ba66ce25de30..3d2d0119cc3d 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -44,6 +44,8 @@ static int cxl_device_id(struct device *dev)
 		return CXL_DEVICE_NVDIMM_BRIDGE;
 	if (dev->type == &cxl_nvdimm_type)
 		return CXL_DEVICE_NVDIMM;
+	if (dev->type == CXL_PMEM_REGION_TYPE())
+		return CXL_DEVICE_PMEM_REGION;
 	if (is_cxl_port(dev)) {
 		if (is_cxl_root(to_cxl_port(dev)))
 			return CXL_DEVICE_ROOT;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index bf4e7f5499c1..dc71ec457608 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1650,6 +1650,139 @@ static ssize_t delete_region_store(struct device *dev,
 }
 DEVICE_ATTR_WO(delete_region);
 
+static void cxl_pmem_region_release(struct device *dev)
+{
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	int i;
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd;
+
+		put_device(&cxlmd->dev);
+	}
+
+	kfree(cxlr_pmem);
+}
+
+static const struct attribute_group *cxl_pmem_region_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+const struct device_type cxl_pmem_region_type = {
+	.name = "cxl_pmem_region",
+	.release = cxl_pmem_region_release,
+	.groups = cxl_pmem_region_attribute_groups,
+};
+
+bool is_cxl_pmem_region(struct device *dev)
+{
+	return dev->type == &cxl_pmem_region_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL);
+
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev),
+			  "not a cxl_pmem_region device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_pmem_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL);
+
+static struct lock_class_key cxl_pmem_region_key;
+
+static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int i;
+
+	down_read(&cxl_region_rwsem);
+	if (p->state != CXL_CONFIG_COMMIT) {
+		cxlr_pmem = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets),
+			    GFP_KERNEL);
+	if (!cxlr_pmem) {
+		cxlr_pmem = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	cxlr_pmem->hpa_range.start = p->res->start;
+	cxlr_pmem->hpa_range.end = p->res->end;
+
+	/* Snapshot the region configuration underneath the cxl_region_rwsem */
+	cxlr_pmem->nr_mappings = p->nr_targets;
+	for (i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+
+		m->cxlmd = cxlmd;
+		get_device(&cxlmd->dev);
+		m->start = cxled->dpa_res->start;
+		m->size = resource_size(cxled->dpa_res);
+		m->position = i;
+	}
+
+	dev = &cxlr_pmem->dev;
+	cxlr_pmem->cxlr = cxlr;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlr->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_pmem_region_type;
+out:
+	up_read(&cxl_region_rwsem);
+
+	return cxlr_pmem;
+}
+
+static void cxlr_pmem_unregister(void *dev)
+{
+	device_unregister(dev);
+}
+
+/**
+ * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge
+ * @cxlr: parent CXL region for this pmem region bridge device
+ *
+ * Return: 0 on success negative error code on failure.
+ */
+static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int rc;
+
+	cxlr_pmem = cxl_pmem_region_alloc(cxlr);
+	if (IS_ERR(cxlr_pmem))
+		return PTR_ERR(cxlr_pmem);
+
+	dev = &cxlr_pmem->dev;
+	rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+		dev_name(dev));
+
+	return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev);
+
+err:
+	put_device(dev);
+	return rc;
+}
+
 static int cxl_region_probe(struct device *dev)
 {
 	struct cxl_region *cxlr = to_cxl_region(dev);
@@ -1673,7 +1806,14 @@ static int cxl_region_probe(struct device *dev)
 	 */
 	up_read(&cxl_region_rwsem);
 
-	return rc;
+	switch (cxlr->mode) {
+	case CXL_DECODER_PMEM:
+		return devm_cxl_add_pmem_region(cxlr);
+	default:
+		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
+			cxlr->mode);
+		return -ENXIO;
+	}
 }
 
 static struct cxl_driver cxl_region_driver = {
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 46d3f173a700..75674400cc8d 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -419,6 +419,25 @@ struct cxl_nvdimm {
 	struct device dev;
 	struct cxl_memdev *cxlmd;
 	struct cxl_nvdimm_bridge *bridge;
+	struct cxl_pmem_region *region;
+};
+
+struct cxl_pmem_region_mapping {
+	struct cxl_memdev *cxlmd;
+	struct cxl_nvdimm *cxl_nvd;
+	u64 start;
+	u64 size;
+	int position;
+};
+
+struct cxl_pmem_region {
+	struct device dev;
+	struct cxl_region *cxlr;
+	struct nd_region *nd_region;
+	struct cxl_nvdimm_bridge *bridge;
+	struct range hpa_range;
+	int nr_mappings;
+	struct cxl_pmem_region_mapping mapping[];
 };
 
 /**
@@ -601,6 +620,7 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv);
 #define CXL_DEVICE_ROOT			4
 #define CXL_DEVICE_MEMORY_EXPANDER	5
 #define CXL_DEVICE_REGION		6
+#define CXL_DEVICE_PMEM_REGION		7
 
 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*")
 #define CXL_MODALIAS_FMT "cxl:t%d"
@@ -612,7 +632,21 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm_bridge(struct device *dev);
 int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd);
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev);
+
+#ifdef CONFIG_CXL_REGION
+bool is_cxl_pmem_region(struct device *dev);
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
+#else
+static inline bool is_cxl_pmem_region(struct device *dev)
+{
+	return false;
+}
+static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	return NULL;
+}
+#endif
 
 /*
  * Unit test builds overrides this to __weak, find the 'strong' version
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index b271f6e90b91..e69f99a0747d 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -7,6 +7,7 @@
 #include <linux/ndctl.h>
 #include <linux/async.h>
 #include <linux/slab.h>
+#include <linux/nd.h>
 #include "cxlmem.h"
 #include "cxl.h"
 
@@ -27,6 +28,19 @@ static void clear_exclusive(void *cxlds)
 static void unregister_nvdimm(void *nvdimm)
 {
 	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge;
+	struct cxl_pmem_region *cxlr_pmem;
+
+	device_lock(&cxl_nvb->dev);
+	cxlr_pmem = cxl_nvd->region;
+	dev_set_drvdata(&cxl_nvd->dev, NULL);
+	cxl_nvd->region = NULL;
+	device_unlock(&cxl_nvb->dev);
+
+	if (cxlr_pmem) {
+		device_release_driver(&cxlr_pmem->dev);
+		put_device(&cxlr_pmem->dev);
+	}
 
 	nvdimm_delete(nvdimm);
 	cxl_nvd->bridge = NULL;
@@ -42,7 +56,7 @@ static int cxl_nvdimm_probe(struct device *dev)
 	struct nvdimm *nvdimm;
 	int rc;
 
-	cxl_nvb = cxl_find_nvdimm_bridge(cxl_nvd);
+	cxl_nvb = cxl_find_nvdimm_bridge(dev);
 	if (!cxl_nvb)
 		return -ENXIO;
 
@@ -223,6 +237,21 @@ static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb)
 	return 0;
 }
 
+static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+
+	if (!is_cxl_pmem_region(dev))
+		return 0;
+
+	cxlr_pmem = to_cxl_pmem_region(dev);
+	if (cxlr_pmem->bridge != cxl_nvb)
+		return 0;
+
+	device_release_driver(dev);
+	return 0;
+}
+
 static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb,
 			       struct nvdimm_bus *nvdimm_bus)
 {
@@ -234,6 +263,8 @@ static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb,
 	 * nvdimm_bus_unregister() rips the nvdimm objects out from
 	 * underneath them.
 	 */
+	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
+			 cxl_pmem_region_release_driver);
 	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
 			 cxl_nvdimm_release_driver);
 	nvdimm_bus_unregister(nvdimm_bus);
@@ -328,6 +359,203 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = {
 	.id = CXL_DEVICE_NVDIMM_BRIDGE,
 };
 
+static int match_cxl_nvdimm(struct device *dev, void *data)
+{
+	return is_cxl_nvdimm(dev);
+}
+
+static void unregister_nvdimm_region(void *nd_region)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_pmem_region *cxlr_pmem;
+	int i;
+
+	cxlr_pmem = nd_region_provider_data(nd_region);
+	cxl_nvb = cxlr_pmem->bridge;
+	device_lock(&cxl_nvb->dev);
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_nvdimm *cxl_nvd = m->cxl_nvd;
+
+		if (cxl_nvd->region) {
+			put_device(&cxlr_pmem->dev);
+			cxl_nvd->region = NULL;
+		}
+	}
+	device_unlock(&cxl_nvb->dev);
+
+	nvdimm_region_delete(nd_region);
+}
+
+static void cxlr_pmem_remove_resource(void *res)
+{
+	remove_resource(res);
+}
+
+struct cxl_pmem_region_info {
+	u64 offset;
+	u64 serial;
+};
+
+static int cxl_pmem_region_probe(struct device *dev)
+{
+	struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE];
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	struct cxl_region *cxlr = cxlr_pmem->cxlr;
+	struct cxl_pmem_region_info *info = NULL;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct nd_interleave_set *nd_set;
+	struct nd_region_desc ndr_desc;
+	struct cxl_nvdimm *cxl_nvd;
+	struct nvdimm *nvdimm;
+	struct resource *res;
+	int rc, i = 0;
+
+	cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev);
+	if (!cxl_nvb) {
+		dev_dbg(dev, "bridge not found\n");
+		return -ENXIO;
+	}
+	cxlr_pmem->bridge = cxl_nvb;
+
+	device_lock(&cxl_nvb->dev);
+	if (!cxl_nvb->nvdimm_bus) {
+		dev_dbg(dev, "nvdimm bus not found\n");
+		rc = -ENXIO;
+		goto err;
+	}
+
+	memset(&mappings, 0, sizeof(mappings));
+	memset(&ndr_desc, 0, sizeof(ndr_desc));
+
+	res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	res->name = "Persistent Memory";
+	res->start = cxlr_pmem->hpa_range.start;
+	res->end = cxlr_pmem->hpa_range.end;
+	res->flags = IORESOURCE_MEM;
+	res->desc = IORES_DESC_PERSISTENT_MEMORY;
+
+	rc = insert_resource(&iomem_resource, res);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
+	if (rc)
+		goto err;
+
+	ndr_desc.res = res;
+	ndr_desc.provider_data = cxlr_pmem;
+
+	ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start);
+	ndr_desc.target_node = phys_to_target_node(res->start);
+	if (ndr_desc.target_node == NUMA_NO_NODE) {
+		ndr_desc.target_node = ndr_desc.numa_node;
+		dev_dbg(&cxlr->dev, "changing target node from %d to %d",
+			NUMA_NO_NODE, ndr_desc.target_node);
+	}
+
+	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+	if (!nd_set) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	ndr_desc.memregion = cxlr->id;
+	set_bit(ND_REGION_CXL, &ndr_desc.flags);
+	set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
+
+	info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_memdev *cxlmd = m->cxlmd;
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
+		struct device *d;
+
+		d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm);
+		if (!d) {
+			dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto err;
+		}
+
+		/* safe to drop ref now with bridge lock held */
+		put_device(d);
+
+		cxl_nvd = to_cxl_nvdimm(d);
+		nvdimm = dev_get_drvdata(&cxl_nvd->dev);
+		if (!nvdimm) {
+			dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto err;
+		}
+		cxl_nvd->region = cxlr_pmem;
+		get_device(&cxlr_pmem->dev);
+		m->cxl_nvd = cxl_nvd;
+		mappings[i] = (struct nd_mapping_desc) {
+			.nvdimm = nvdimm,
+			.start = m->start,
+			.size = m->size,
+			.position = i,
+		};
+		info[i].offset = m->start;
+		info[i].serial = cxlds->serial;
+	}
+	ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
+	ndr_desc.mapping = mappings;
+
+	/*
+	 * TODO enable CXL labels which skip the need for 'interleave-set cookie'
+	 */
+	nd_set->cookie1 =
+		nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0);
+	nd_set->cookie2 = nd_set->cookie1;
+	ndr_desc.nd_set = nd_set;
+
+	cxlr_pmem->nd_region =
+		nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
+	if (IS_ERR(cxlr_pmem->nd_region)) {
+		rc = PTR_ERR(cxlr_pmem->nd_region);
+		goto err;
+	}
+
+	rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
+				      cxlr_pmem->nd_region);
+out:
+	kfree(info);
+	device_unlock(&cxl_nvb->dev);
+	put_device(&cxl_nvb->dev);
+
+	return rc;
+
+err:
+	dev_dbg(dev, "failed to create nvdimm region\n");
+	for (i--; i >= 0; i--) {
+		nvdimm = mappings[i].nvdimm;
+		cxl_nvd = nvdimm_provider_data(nvdimm);
+		put_device(&cxl_nvd->region->dev);
+		cxl_nvd->region = NULL;
+	}
+	goto out;
+}
+
+static struct cxl_driver cxl_pmem_region_driver = {
+	.name = "cxl_pmem_region",
+	.probe = cxl_pmem_region_probe,
+	.id = CXL_DEVICE_PMEM_REGION,
+};
+
 /*
  * Return all bridges to the CXL_NVB_NEW state to invalidate any
  * ->state_work referring to the now destroyed cxl_pmem_wq.
@@ -372,8 +600,14 @@ static __init int cxl_pmem_init(void)
 	if (rc)
 		goto err_nvdimm;
 
+	rc = cxl_driver_register(&cxl_pmem_region_driver);
+	if (rc)
+		goto err_region;
+
 	return 0;
 
+err_region:
+	cxl_driver_unregister(&cxl_nvdimm_driver);
 err_nvdimm:
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 err_bridge:
@@ -383,6 +617,7 @@ err_bridge:
 
 static __exit void cxl_pmem_exit(void)
 {
+	cxl_driver_unregister(&cxl_pmem_region_driver);
 	cxl_driver_unregister(&cxl_nvdimm_driver);
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 	destroy_cxl_pmem_wq();
@@ -394,3 +629,4 @@ module_exit(cxl_pmem_exit);
 MODULE_IMPORT_NS(CXL);
 MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE);
 MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM);
+MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index d976260eca7a..473a71bbd9c9 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -133,7 +133,8 @@ static void nd_region_release(struct device *dev)
 		put_device(&nvdimm->dev);
 	}
 	free_percpu(nd_region->lane);
-	memregion_free(nd_region->id);
+	if (!test_bit(ND_REGION_CXL, &nd_region->flags))
+		memregion_free(nd_region->id);
 	kfree(nd_region);
 }
 
@@ -982,9 +983,14 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 	if (!nd_region)
 		return NULL;
-	nd_region->id = memregion_alloc(GFP_KERNEL);
-	if (nd_region->id < 0)
-		goto err_id;
+	/* CXL pre-assigns memregion ids before creating nvdimm regions */
+	if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) {
+		nd_region->id = ndr_desc->memregion;
+	} else {
+		nd_region->id = memregion_alloc(GFP_KERNEL);
+		if (nd_region->id < 0)
+			goto err_id;
+	}
 
 	nd_region->lane = alloc_percpu(struct nd_percpu_lane);
 	if (!nd_region->lane)
@@ -1043,9 +1049,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 	return nd_region;
 
- err_percpu:
-	memregion_free(nd_region->id);
- err_id:
+err_percpu:
+	if (!test_bit(ND_REGION_CXL, &ndr_desc->flags))
+		memregion_free(nd_region->id);
+err_id:
 	kfree(nd_region);
 	return NULL;
 }
@@ -1068,6 +1075,13 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+void nvdimm_region_delete(struct nd_region *nd_region)
+{
+	if (nd_region)
+		nd_device_unregister(&nd_region->dev, ND_SYNC);
+}
+EXPORT_SYMBOL_GPL(nvdimm_region_delete);
+
 int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
 {
 	int rc = 0;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 0d61e07b6827..c74acfa1a3fe 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -59,6 +59,9 @@ enum {
 	/* Platform provides asynchronous flush mechanism */
 	ND_REGION_ASYNC = 3,
 
+	/* Region was created by CXL subsystem */
+	ND_REGION_CXL = 4,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -122,6 +125,7 @@ struct nd_region_desc {
 	int numa_node;
 	int target_node;
 	unsigned long flags;
+	int memregion;
 	struct device_node *of_node;
 	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 };
@@ -259,6 +263,7 @@ static inline struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 			cmd_mask, num_flush, flush_wpq, NULL, NULL, NULL);
 }
 void nvdimm_delete(struct nvdimm *nvdimm);
+void nvdimm_region_delete(struct nd_region *nd_region);
 
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
-- 
cgit v1.2.3


From 04b9407197789c81fffac52921e703cb47967d6a Mon Sep 17 00:00:00 2001
From: Daniil Lunev <dlunev@chromium.org>
Date: Wed, 27 Jul 2022 16:44:24 +1000
Subject: vfs: function to prevent re-use of block-device-based superblocks

The function is to be called from filesystem-specific code to mark a
superblock to be ignored by superblock test and thus never re-used.  The
function also unregisters bdi if the bdi is per-superblock to avoid
collision if a new superblock is created to represent the filesystem.
generic_shutdown_super() skips unregistering bdi for a retired superlock as
it assumes retire function has already done it.

This patch adds the functionality only for the block-device-based supers,
since the primary use case of the feature is to gracefully handle force
unmount of external devices, mounted with FUSE.  This can be further
extended to cover all superblocks, if the need arises.

Signed-off-by: Daniil Lunev <dlunev@chromium.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/super.c         | 33 +++++++++++++++++++++++++++++++--
 include/linux/fs.h |  2 ++
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 60f57c7bc0a6..258d9257b0a7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -422,6 +422,35 @@ bool trylock_super(struct super_block *sb)
 	return false;
 }
 
+/**
+ *	retire_super	-	prevents superblock from being reused
+ *	@sb: superblock to retire
+ *
+ *	The function marks superblock to be ignored in superblock test, which
+ *	prevents it from being reused for any new mounts.  If the superblock has
+ *	a private bdi, it also unregisters it, but doesn't reduce the refcount
+ *	of the superblock to prevent potential races.  The refcount is reduced
+ *	by generic_shutdown_super().  The function can not be called
+ *	concurrently with generic_shutdown_super().  It is safe to call the
+ *	function multiple times, subsequent calls have no effect.
+ *
+ *	The marker will affect the re-use only for block-device-based
+ *	superblocks.  Other superblocks will still get marked if this function
+ *	is used, but that will not affect their reusability.
+ */
+void retire_super(struct super_block *sb)
+{
+	WARN_ON(!sb->s_bdev);
+	down_write(&sb->s_umount);
+	if (sb->s_iflags & SB_I_PERSB_BDI) {
+		bdi_unregister(sb->s_bdi);
+		sb->s_iflags &= ~SB_I_PERSB_BDI;
+	}
+	sb->s_iflags |= SB_I_RETIRED;
+	up_write(&sb->s_umount);
+}
+EXPORT_SYMBOL(retire_super);
+
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
  *	@sb: superblock to kill
@@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
 
 static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
 {
-	return s->s_bdev == fc->sget_key;
+	return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
 }
 
 /**
@@ -1307,7 +1336,7 @@ EXPORT_SYMBOL(get_tree_bdev);
 
 static int test_bdev_super(struct super_block *s, void *data)
 {
-	return (void *)s->s_bdev == data;
+	return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
 }
 
 struct dentry *mount_bdev(struct file_system_type *fs_type,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..246120ee0573 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1412,6 +1412,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
+#define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -2432,6 +2433,7 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
 extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
+void retire_super(struct super_block *sb);
 void generic_shutdown_super(struct super_block *sb);
 void kill_block_super(struct super_block *sb);
 void kill_anon_super(struct super_block *sb);
-- 
cgit v1.2.3


From 7c56a8733d0a2a4be2438a7512566e5ce552fccf Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Wed, 13 Jul 2022 17:47:27 +0200
Subject: watchdog: export lockup_detector_reconfigure

In some circumstances it may be interesting to reconfigure the watchdog
from inside the kernel.

On PowerPC, this may helpful before and after a LPAR migration (LPM) is
initiated, because it implies some latencies, watchdog, and especially NMI
watchdog is expected to be triggered during this operation. Reconfiguring
the watchdog with a factor, would prevent it to happen too frequently
during LPM.

Rename lockup_detector_reconfigure() as __lockup_detector_reconfigure() and
create a new function lockup_detector_reconfigure() calling
__lockup_detector_reconfigure() under the protection of watchdog_mutex.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
[mpe: Squash in build fix from Laurent, reported by Sachin]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220713154729.80789-3-ldufour@linux.ibm.com
---
 include/linux/nmi.h |  2 ++
 kernel/watchdog.c   | 21 ++++++++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 750c7f395ca9..f700ff2df074 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -122,6 +122,8 @@ int watchdog_nmi_probe(void);
 int watchdog_nmi_enable(unsigned int cpu);
 void watchdog_nmi_disable(unsigned int cpu);
 
+void lockup_detector_reconfigure(void);
+
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
  *
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 20a7a55e62b6..41596c415111 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -541,7 +541,7 @@ int lockup_detector_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
-static void lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
@@ -561,6 +561,13 @@ static void lockup_detector_reconfigure(void)
 	__lockup_detector_cleanup();
 }
 
+void lockup_detector_reconfigure(void)
+{
+	mutex_lock(&watchdog_mutex);
+	__lockup_detector_reconfigure();
+	mutex_unlock(&watchdog_mutex);
+}
+
 /*
  * Create the watchdog infrastructure and configure the detector(s).
  */
@@ -577,13 +584,13 @@ static __init void lockup_detector_setup(void)
 		return;
 
 	mutex_lock(&watchdog_mutex);
-	lockup_detector_reconfigure();
+	__lockup_detector_reconfigure();
 	softlockup_initialized = true;
 	mutex_unlock(&watchdog_mutex);
 }
 
 #else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(void)
 {
 	cpus_read_lock();
 	watchdog_nmi_stop();
@@ -591,9 +598,13 @@ static void lockup_detector_reconfigure(void)
 	watchdog_nmi_start();
 	cpus_read_unlock();
 }
+void lockup_detector_reconfigure(void)
+{
+	__lockup_detector_reconfigure();
+}
 static inline void lockup_detector_setup(void)
 {
-	lockup_detector_reconfigure();
+	__lockup_detector_reconfigure();
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
@@ -633,7 +644,7 @@ static void proc_watchdog_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-	lockup_detector_reconfigure();
+	__lockup_detector_reconfigure();
 }
 
 /*
-- 
cgit v1.2.3


From 0113780870b1597ae49f30abfa4957c239f913d3 Mon Sep 17 00:00:00 2001
From: Aharon Landau <aharonl@nvidia.com>
Date: Tue, 26 Jul 2022 10:19:11 +0300
Subject: RDMA/mlx5: Rename the mkey cache variables and functions

After replacing the MR cache with an Mkey cache, rename the variables and
functions to fit the new meaning.

Link: https://lore.kernel.org/r/20220726071911.122765-6-michaelgur@nvidia.com
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    |  4 +--
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 14 +++++-----
 drivers/infiniband/hw/mlx5/mr.c      | 54 ++++++++++++++++++------------------
 drivers/infiniband/hw/mlx5/odp.c     |  2 +-
 include/linux/mlx5/driver.h          |  6 ++--
 5 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b68fddeac0f1..a174a0eee8dc 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4002,7 +4002,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_mr_cache_cleanup(dev);
+	err = mlx5_mkey_cache_cleanup(dev);
 	if (err)
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
@@ -4022,7 +4022,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 	if (ret)
 		return ret;
 
-	ret = mlx5_mr_cache_init(dev);
+	ret = mlx5_mkey_cache_init(dev);
 	if (ret) {
 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
 		mlx5r_umr_resource_cleanup(dev);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 91f985cd7d90..2e2ad3918385 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -764,9 +764,9 @@ struct mlx5r_async_create_mkey {
 	u32 mkey;
 };
 
-struct mlx5_mr_cache {
+struct mlx5_mkey_cache {
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	struct mlx5_cache_ent	ent[MAX_MKEY_CACHE_ENTRIES];
 	struct dentry		*root;
 	unsigned long		last_add;
 };
@@ -1065,7 +1065,7 @@ struct mlx5_ib_dev {
 	struct mlx5_ib_resources	devr;
 
 	atomic_t			mkey_var;
-	struct mlx5_mr_cache		cache;
+	struct mlx5_mkey_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
@@ -1310,8 +1310,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       struct mlx5_cache_ent *ent,
@@ -1339,7 +1339,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1358,7 +1358,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {}
 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 					 struct mlx5_ib_mr *mr, int flags) {}
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index edbc2990d151..129d531bd01b 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -119,7 +119,7 @@ static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
 				&async_create->cb_work);
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev);
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
@@ -515,11 +515,11 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static bool someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mkey_cache *cache)
 {
 	unsigned int i;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &cache->ent[i];
 		bool ret;
 
@@ -569,7 +569,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	int err;
 
 	xa_lock_irq(&ent->mkeys);
@@ -681,7 +681,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent = &cache->ent[c];
 	u32 mkey;
 
@@ -696,7 +696,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 	xa_unlock_irq(&ent->mkeys);
 }
 
-static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
@@ -705,9 +705,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	dev->cache.root = NULL;
 }
 
-static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct dentry *dir;
 	int i;
@@ -717,7 +717,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 
 	cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		sprintf(ent->name, "%d", ent->order);
 		dir = debugfs_create_dir(ent->name, cache->root);
@@ -735,9 +735,9 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	int i;
 
@@ -750,7 +750,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
 		ent->order = i + 2;
@@ -759,12 +759,12 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
-		if (i > MR_CACHE_LAST_STD_ENTRY) {
-			mlx5_odp_init_mr_cache_entry(ent);
+		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
+			mlx5_odp_init_mkey_cache_entry(ent);
 			continue;
 		}
 
-		if (ent->order > mr_cache_max_order(dev))
+		if (ent->order > mkey_cache_max_order(dev))
 			continue;
 
 		ent->page = PAGE_SHIFT;
@@ -781,19 +781,19 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		xa_unlock_irq(&ent->mkeys);
 	}
 
-	mlx5_mr_cache_debugfs_init(dev);
+	mlx5_mkey_cache_debugfs_init(dev);
 
 	return 0;
 }
 
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 {
 	unsigned int i;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 
 		xa_lock_irq(&ent->mkeys);
@@ -802,10 +802,10 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 		cancel_delayed_work_sync(&ent->dwork);
 	}
 
-	mlx5_mr_cache_debugfs_cleanup(dev);
+	mlx5_mkey_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
 		clean_keys(dev, i);
 
 	destroy_workqueue(dev->cache.wq);
@@ -872,22 +872,22 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
 	return (npages + 1) / 2;
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev)
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return MR_CACHE_LAST_STD_ENTRY + 2;
+		return MKEY_CACHE_LAST_STD_ENTRY + 2;
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
-						      unsigned int order)
+static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
+							unsigned int order)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 
 	if (order < cache->ent[0].order)
 		return &cache->ent[0];
 	order = order - cache->ent[0].order;
-	if (order > MR_CACHE_LAST_STD_ENTRY)
+	if (order > MKEY_CACHE_LAST_STD_ENTRY)
 		return NULL;
 	return &cache->ent[order];
 }
@@ -930,7 +930,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
-	ent = mr_cache_ent_from_order(
+	ent = mkey_cache_ent_from_order(
 		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
 	/*
 	 * Matches access in alloc_cache_mr(). If the MR can't come from the
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 84da5674e1ab..e305bf1dc6c2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -1588,7 +1588,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	return err;
 }
 
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent)
 {
 	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 		return;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 76d7661e3e63..220597c2f436 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -728,10 +728,10 @@ enum {
 };
 
 enum {
-	MR_CACHE_LAST_STD_ENTRY = 20,
+	MKEY_CACHE_LAST_STD_ENTRY = 20,
 	MLX5_IMR_MTT_CACHE_ENTRY,
 	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MR_CACHE_ENTRIES
+	MAX_MKEY_CACHE_ENTRIES
 };
 
 struct mlx5_profile {
@@ -740,7 +740,7 @@ struct mlx5_profile {
 	struct {
 		int	size;
 		int	limit;
-	} mr_cache[MAX_MR_CACHE_ENTRIES];
+	} mr_cache[MAX_MKEY_CACHE_ENTRIES];
 };
 
 struct mlx5_hca_cap {
-- 
cgit v1.2.3


From 103e10c69c611efabccf57d799c4b191d53ee765 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Fri, 22 Jul 2022 09:57:46 +0300
Subject: ACPI: property: Add support for parsing buffer property UUID

Add support for newly added buffer property UUID, as defined in the DSD
guide section 3.3 [1]

Link: https://github.com/UEFI/DSD-Guide/blob/main/src/dsd-guide.adoc#buffer-data-extension-uuid # [1]
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 142 +++++++++++++++++++++++++++++++++++++++++++-----
 include/acpi/acpi_bus.h |   3 +-
 include/linux/acpi.h    |   2 +-
 3 files changed, 132 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index b751eb98106c..7e3d163a6b1f 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -55,14 +55,19 @@ static const guid_t ads_guid =
 	GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6,
 		  0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b);
 
+static const guid_t buffer_prop_guid =
+	GUID_INIT(0xedb12dd0, 0x363d, 0x4085,
+		  0xa3, 0xd2, 0x49, 0x52, 0x2c, 0xa1, 0x60, 0xc4);
+
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
-					   const union acpi_object *desc,
+					   union acpi_object *desc,
 					   struct acpi_device_data *data,
 					   struct fwnode_handle *parent);
-static bool acpi_extract_properties(const union acpi_object *desc,
+static bool acpi_extract_properties(acpi_handle handle,
+				    union acpi_object *desc,
 				    struct acpi_device_data *data);
 
-static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
+static bool acpi_nondev_subnode_extract(union acpi_object *desc,
 					acpi_handle handle,
 					const union acpi_object *link,
 					struct list_head *list,
@@ -81,7 +86,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 	INIT_LIST_HEAD(&dn->data.properties);
 	INIT_LIST_HEAD(&dn->data.subnodes);
 
-	result = acpi_extract_properties(desc, &dn->data);
+	result = acpi_extract_properties(handle, desc, &dn->data);
 
 	if (handle) {
 		acpi_handle scope;
@@ -156,7 +161,7 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope,
 }
 
 static bool acpi_add_nondev_subnodes(acpi_handle scope,
-				     const union acpi_object *links,
+				     union acpi_object *links,
 				     struct list_head *list,
 				     struct fwnode_handle *parent)
 {
@@ -164,7 +169,7 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope,
 	int i;
 
 	for (i = 0; i < links->package.count; i++) {
-		const union acpi_object *link, *desc;
+		union acpi_object *link, *desc;
 		acpi_handle handle;
 		bool result;
 
@@ -204,7 +209,7 @@ static bool acpi_add_nondev_subnodes(acpi_handle scope,
 }
 
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
-					   const union acpi_object *desc,
+					   union acpi_object *desc,
 					   struct acpi_device_data *data,
 					   struct fwnode_handle *parent)
 {
@@ -212,7 +217,8 @@ static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
 
 	/* Look for the ACPI data subnodes GUID. */
 	for (i = 0; i < desc->package.count; i += 2) {
-		const union acpi_object *guid, *links;
+		const union acpi_object *guid;
+		union acpi_object *links;
 
 		guid = &desc->package.elements[i];
 		links = &desc->package.elements[i + 1];
@@ -325,7 +331,7 @@ static bool acpi_is_property_guid(const guid_t *guid)
 
 struct acpi_device_properties *
 acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
-		    const union acpi_object *properties)
+		    union acpi_object *properties)
 {
 	struct acpi_device_properties *props;
 
@@ -377,7 +383,104 @@ static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data)
 	return true;
 }
 
-static bool acpi_extract_properties(const union acpi_object *desc,
+static void acpi_data_add_buffer_props(acpi_handle handle,
+				       struct acpi_device_data *data,
+				       union acpi_object *properties)
+{
+	struct acpi_device_properties *props;
+	union acpi_object *package;
+	size_t alloc_size;
+	unsigned int i;
+	u32 *count;
+
+	if (check_mul_overflow((size_t)properties->package.count,
+			       sizeof(*package) + sizeof(void *),
+			       &alloc_size) ||
+	    check_add_overflow(sizeof(*props) + sizeof(*package), alloc_size,
+			       &alloc_size)) {
+		acpi_handle_warn(handle,
+				 "can't allocate memory for %u buffer props",
+				 properties->package.count);
+		return;
+	}
+
+	props = kvzalloc(alloc_size, GFP_KERNEL);
+	if (!props)
+		return;
+
+	props->guid = &buffer_prop_guid;
+	props->bufs = (void *)(props + 1);
+	props->properties = (void *)(props->bufs + properties->package.count);
+
+	/* Outer package */
+	package = props->properties;
+	package->type = ACPI_TYPE_PACKAGE;
+	package->package.elements = package + 1;
+	count = &package->package.count;
+	*count = 0;
+
+	/* Inner packages */
+	package++;
+
+	for (i = 0; i < properties->package.count; i++) {
+		struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
+		union acpi_object *property = &properties->package.elements[i];
+		union acpi_object *prop, *obj, *buf_obj;
+		acpi_status status;
+
+		if (property->type != ACPI_TYPE_PACKAGE ||
+		    property->package.count != 2) {
+			acpi_handle_warn(handle,
+					 "buffer property %u has %u entries\n",
+					 i, property->package.count);
+			continue;
+		}
+
+		prop = &property->package.elements[0];
+		obj = &property->package.elements[1];
+
+		if (prop->type != ACPI_TYPE_STRING ||
+		    obj->type != ACPI_TYPE_STRING) {
+			acpi_handle_warn(handle,
+					 "wrong object types %u and %u\n",
+					 prop->type, obj->type);
+			continue;
+		}
+
+		status = acpi_evaluate_object_typed(handle, obj->string.pointer,
+						    NULL, &buf,
+						    ACPI_TYPE_BUFFER);
+		if (ACPI_FAILURE(status)) {
+			acpi_handle_warn(handle,
+					 "can't evaluate \"%*pE\" as buffer\n",
+					 obj->string.length,
+					 obj->string.pointer);
+			continue;
+		}
+
+		package->type = ACPI_TYPE_PACKAGE;
+		package->package.elements = prop;
+		package->package.count = 2;
+
+		buf_obj = buf.pointer;
+
+		/* Replace the string object with a buffer object */
+		obj->type = ACPI_TYPE_BUFFER;
+		obj->buffer.length = buf_obj->buffer.length;
+		obj->buffer.pointer = buf_obj->buffer.pointer;
+
+		props->bufs[i] = buf.pointer;
+		package++;
+		(*count)++;
+	}
+
+	if (*count)
+		list_add(&props->list, &data->properties);
+	else
+		kvfree(props);
+}
+
+static bool acpi_extract_properties(acpi_handle scope, union acpi_object *desc,
 				    struct acpi_device_data *data)
 {
 	int i;
@@ -387,7 +490,8 @@ static bool acpi_extract_properties(const union acpi_object *desc,
 
 	/* Look for the device properties GUID. */
 	for (i = 0; i < desc->package.count; i += 2) {
-		const union acpi_object *guid, *properties;
+		const union acpi_object *guid;
+		union acpi_object *properties;
 
 		guid = &desc->package.elements[i];
 		properties = &desc->package.elements[i + 1];
@@ -401,6 +505,12 @@ static bool acpi_extract_properties(const union acpi_object *desc,
 		    properties->type != ACPI_TYPE_PACKAGE)
 			break;
 
+		if (guid_equal((guid_t *)guid->buffer.pointer,
+			       &buffer_prop_guid)) {
+			acpi_data_add_buffer_props(scope, data, properties);
+			continue;
+		}
+
 		if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer))
 			continue;
 
@@ -447,7 +557,7 @@ void acpi_init_properties(struct acpi_device *adev)
 	if (ACPI_FAILURE(status))
 		goto out;
 
-	if (acpi_extract_properties(buf.pointer, &adev->data)) {
+	if (acpi_extract_properties(adev->handle, buf.pointer, &adev->data)) {
 		adev->data.pointer = buf.pointer;
 		if (acpi_of)
 			acpi_init_of_compatible(adev);
@@ -477,8 +587,14 @@ static void acpi_free_device_properties(struct list_head *list)
 	struct acpi_device_properties *props, *tmp;
 
 	list_for_each_entry_safe(props, tmp, list, list) {
+		u32 i;
+
 		list_del(&props->list);
-		kfree(props);
+		/* Buffer data properties were separately allocated */
+		if (props->bufs)
+			for (i = 0; i < props->properties->package.count; i++)
+				ACPI_FREE(props->bufs[i]);
+		kvfree(props);
 	}
 }
 
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 0dc1ea0b52f5..a1690000ac95 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -344,8 +344,9 @@ struct acpi_device_physical_node {
 
 struct acpi_device_properties {
 	const guid_t *guid;
-	const union acpi_object *properties;
+	union acpi_object *properties;
 	struct list_head list;
+	void **bufs;
 };
 
 /* ACPI Device Specific Data (_DSD) */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 44975c1bbe12..7c46f152106b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1243,7 +1243,7 @@ static inline bool acpi_dev_has_props(const struct acpi_device *adev)
 
 struct acpi_device_properties *
 acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
-		    const union acpi_object *properties);
+		    union acpi_object *properties);
 
 int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname,
 		       void **valptr);
-- 
cgit v1.2.3


From 72691a269f0baad6d5f4aa7af97c29081b86d70f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 27 Jul 2022 13:02:27 -0400
Subject: SUNRPC: Don't reuse bvec on retransmission of the request

If a request is re-encoded and then retransmitted, we need to make sure
that we also re-encode the bvec, in case the page lists have changed.

Fixes: ff053dbbaffe ("SUNRPC: Move the call to xprt_send_pagedata() out of xprt_sock_sendmsg()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  3 ++-
 net/sunrpc/clnt.c           |  1 -
 net/sunrpc/xprt.c           | 27 ++++++++++++++++++---------
 net/sunrpc/xprtsock.c       | 12 ++----------
 4 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 0d51b9f9ea37..b9f59aabee53 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -144,7 +144,8 @@ struct rpc_xprt_ops {
 	unsigned short	(*get_srcport)(struct rpc_xprt *xprt);
 	int		(*buf_alloc)(struct rpc_task *task);
 	void		(*buf_free)(struct rpc_task *task);
-	int		(*prepare_request)(struct rpc_rqst *req);
+	int		(*prepare_request)(struct rpc_rqst *req,
+					   struct xdr_buf *buf);
 	int		(*send_request)(struct rpc_rqst *req);
 	void		(*wait_for_reply_request)(struct rpc_task *task);
 	void		(*timer)(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index bbfc47f03480..b098e707ad41 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1870,7 +1870,6 @@ rpc_xdr_encode(struct rpc_task *task)
 	req->rq_snd_buf.head[0].iov_len = 0;
 	xdr_init_encode(&xdr, &req->rq_snd_buf,
 			req->rq_snd_buf.head[0].iov_base, req);
-	xdr_free_bvec(&req->rq_snd_buf);
 	if (rpc_encode_header(task, &xdr))
 		return;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 44348c9f4b00..d71eec494826 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -73,7 +73,7 @@ static void	xprt_init(struct rpc_xprt *xprt, struct net *net);
 static __be32	xprt_alloc_xid(struct rpc_xprt *xprt);
 static void	xprt_destroy(struct rpc_xprt *xprt);
 static void	xprt_request_init(struct rpc_task *task);
-static int	xprt_request_prepare(struct rpc_rqst *req);
+static int	xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf);
 
 static DEFINE_SPINLOCK(xprt_list_lock);
 static LIST_HEAD(xprt_list);
@@ -1149,7 +1149,7 @@ xprt_request_enqueue_receive(struct rpc_task *task)
 	if (!xprt_request_need_enqueue_receive(task, req))
 		return 0;
 
-	ret = xprt_request_prepare(task->tk_rqstp);
+	ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf);
 	if (ret)
 		return ret;
 	spin_lock(&xprt->queue_lock);
@@ -1179,8 +1179,11 @@ xprt_request_dequeue_receive_locked(struct rpc_task *task)
 {
 	struct rpc_rqst *req = task->tk_rqstp;
 
-	if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate))
+	if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) {
 		xprt_request_rb_remove(req->rq_xprt, req);
+		xdr_free_bvec(&req->rq_rcv_buf);
+		req->rq_private_buf.bvec = NULL;
+	}
 }
 
 /**
@@ -1336,8 +1339,14 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 {
 	struct rpc_rqst *pos, *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
+	int ret;
 
 	if (xprt_request_need_enqueue_transmit(task, req)) {
+		ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf);
+		if (ret) {
+			task->tk_status = ret;
+			return;
+		}
 		req->rq_bytes_sent = 0;
 		spin_lock(&xprt->queue_lock);
 		/*
@@ -1397,6 +1406,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
 	} else
 		list_del(&req->rq_xmit2);
 	atomic_long_dec(&req->rq_xprt->xmit_queuelen);
+	xdr_free_bvec(&req->rq_snd_buf);
 }
 
 /**
@@ -1433,8 +1443,6 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
 	    test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
 	    xprt_is_pinned_rqst(req)) {
 		spin_lock(&xprt->queue_lock);
-		xprt_request_dequeue_transmit_locked(task);
-		xprt_request_dequeue_receive_locked(task);
 		while (xprt_is_pinned_rqst(req)) {
 			set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
 			spin_unlock(&xprt->queue_lock);
@@ -1442,6 +1450,8 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
 			spin_lock(&xprt->queue_lock);
 			clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
 		}
+		xprt_request_dequeue_transmit_locked(task);
+		xprt_request_dequeue_receive_locked(task);
 		spin_unlock(&xprt->queue_lock);
 	}
 }
@@ -1449,18 +1459,19 @@ xprt_request_dequeue_xprt(struct rpc_task *task)
 /**
  * xprt_request_prepare - prepare an encoded request for transport
  * @req: pointer to rpc_rqst
+ * @buf: pointer to send/rcv xdr_buf
  *
  * Calls into the transport layer to do whatever is needed to prepare
  * the request for transmission or receive.
  * Returns error, or zero.
  */
 static int
-xprt_request_prepare(struct rpc_rqst *req)
+xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf)
 {
 	struct rpc_xprt *xprt = req->rq_xprt;
 
 	if (xprt->ops->prepare_request)
-		return xprt->ops->prepare_request(req);
+		return xprt->ops->prepare_request(req, buf);
 	return 0;
 }
 
@@ -1961,8 +1972,6 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(task);
-	xdr_free_bvec(&req->rq_rcv_buf);
-	xdr_free_bvec(&req->rq_snd_buf);
 	if (req->rq_cred != NULL)
 		put_rpccred(req->rq_cred);
 	if (req->rq_release_snd_buf)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index eba1be9984f8..e976007f4fd0 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -822,17 +822,9 @@ static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait)
 	return ret;
 }
 
-static int
-xs_stream_prepare_request(struct rpc_rqst *req)
+static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf)
 {
-	gfp_t gfp = rpc_task_gfp_mask();
-	int ret;
-
-	ret = xdr_alloc_bvec(&req->rq_snd_buf, gfp);
-	if (ret < 0)
-		return ret;
-	xdr_free_bvec(&req->rq_rcv_buf);
-	return xdr_alloc_bvec(&req->rq_rcv_buf, gfp);
+	return xdr_alloc_bvec(buf, rpc_task_gfp_mask());
 }
 
 /*
-- 
cgit v1.2.3


From c452d49849d48bd37ae97fc2bc92c6435707c35f Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@microchip.com>
Date: Mon, 25 Jul 2022 12:24:59 +0300
Subject: mtd: spi-nor: s/addr_width/addr_nbytes

Address width was an unfortunate name, as it means the number of IO lines
used for the address, whereas in the code it is used as the number of
address bytes. s/addr_width/addr_nbytes throughout the entire SPI NOR
framework.

Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Acked-by: Pratyush Yadav <p.yadav@ti.com>
Link: https://lore.kernel.org/r/20220725092505.446315-2-tudor.ambarus@microchip.com
---
 drivers/mtd/spi-nor/controllers/hisi-sfc.c  |  2 +-
 drivers/mtd/spi-nor/controllers/nxp-spifi.c |  8 ++---
 drivers/mtd/spi-nor/core.c                  | 54 ++++++++++++++---------------
 drivers/mtd/spi-nor/core.h                  | 12 +++----
 drivers/mtd/spi-nor/debugfs.c               |  2 +-
 drivers/mtd/spi-nor/issi.c                  |  8 ++---
 drivers/mtd/spi-nor/otp.c                   | 12 +++----
 drivers/mtd/spi-nor/sfdp.c                  | 32 ++++++++---------
 drivers/mtd/spi-nor/xilinx.c                |  2 +-
 include/linux/mtd/spi-nor.h                 |  4 +--
 10 files changed, 68 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/controllers/hisi-sfc.c b/drivers/mtd/spi-nor/controllers/hisi-sfc.c
index 94a969185ceb..5070d72835ec 100644
--- a/drivers/mtd/spi-nor/controllers/hisi-sfc.c
+++ b/drivers/mtd/spi-nor/controllers/hisi-sfc.c
@@ -237,7 +237,7 @@ static int hisi_spi_nor_dma_transfer(struct spi_nor *nor, loff_t start_off,
 	reg = readl(host->regbase + FMC_CFG);
 	reg &= ~(FMC_CFG_OP_MODE_MASK | SPI_NOR_ADDR_MODE_MASK);
 	reg |= FMC_CFG_OP_MODE_NORMAL;
-	reg |= (nor->addr_width == 4) ? SPI_NOR_ADDR_MODE_4BYTES
+	reg |= (nor->addr_nbytes == 4) ? SPI_NOR_ADDR_MODE_4BYTES
 		: SPI_NOR_ADDR_MODE_3BYTES;
 	writel(reg, host->regbase + FMC_CFG);
 
diff --git a/drivers/mtd/spi-nor/controllers/nxp-spifi.c b/drivers/mtd/spi-nor/controllers/nxp-spifi.c
index 9032b9ab2eaf..ab3990e6ac25 100644
--- a/drivers/mtd/spi-nor/controllers/nxp-spifi.c
+++ b/drivers/mtd/spi-nor/controllers/nxp-spifi.c
@@ -203,7 +203,7 @@ static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
 	      SPIFI_CMD_DATALEN(len) |
 	      SPIFI_CMD_FIELDFORM_ALL_SERIAL |
 	      SPIFI_CMD_OPCODE(nor->program_opcode) |
-	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
+	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_nbytes + 1);
 	writel(cmd, spifi->io_base + SPIFI_CMD);
 
 	for (i = 0; i < len; i++)
@@ -230,7 +230,7 @@ static int nxp_spifi_erase(struct spi_nor *nor, loff_t offs)
 
 	cmd = SPIFI_CMD_FIELDFORM_ALL_SERIAL |
 	      SPIFI_CMD_OPCODE(nor->erase_opcode) |
-	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
+	      SPIFI_CMD_FRAMEFORM(spifi->nor.addr_nbytes + 1);
 	writel(cmd, spifi->io_base + SPIFI_CMD);
 
 	return nxp_spifi_wait_for_cmd(spifi);
@@ -252,12 +252,12 @@ static int nxp_spifi_setup_memory_cmd(struct nxp_spifi *spifi)
 	}
 
 	/* Memory mode supports address length between 1 and 4 */
-	if (spifi->nor.addr_width < 1 || spifi->nor.addr_width > 4)
+	if (spifi->nor.addr_nbytes < 1 || spifi->nor.addr_nbytes > 4)
 		return -EINVAL;
 
 	spifi->mcmd |= SPIFI_CMD_OPCODE(spifi->nor.read_opcode) |
 		       SPIFI_CMD_INTLEN(spifi->nor.read_dummy / 8) |
-		       SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
+		       SPIFI_CMD_FRAMEFORM(spifi->nor.addr_nbytes + 1);
 
 	return 0;
 }
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index ce5d69317d46..31604188ee59 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -38,7 +38,7 @@
  */
 #define CHIP_ERASE_2MB_READY_WAIT_JIFFIES	(40UL * HZ)
 
-#define SPI_NOR_MAX_ADDR_WIDTH	4
+#define SPI_NOR_MAX_ADDR_NBYTES	4
 
 #define SPI_NOR_SRST_SLEEP_MIN 200
 #define SPI_NOR_SRST_SLEEP_MAX 400
@@ -198,7 +198,7 @@ static ssize_t spi_nor_spimem_read_data(struct spi_nor *nor, loff_t from,
 {
 	struct spi_mem_op op =
 		SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 0),
-			   SPI_MEM_OP_ADDR(nor->addr_width, from, 0),
+			   SPI_MEM_OP_ADDR(nor->addr_nbytes, from, 0),
 			   SPI_MEM_OP_DUMMY(nor->read_dummy, 0),
 			   SPI_MEM_OP_DATA_IN(len, buf, 0));
 	bool usebouncebuf;
@@ -262,7 +262,7 @@ static ssize_t spi_nor_spimem_write_data(struct spi_nor *nor, loff_t to,
 {
 	struct spi_mem_op op =
 		SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 0),
-			   SPI_MEM_OP_ADDR(nor->addr_width, to, 0),
+			   SPI_MEM_OP_ADDR(nor->addr_nbytes, to, 0),
 			   SPI_MEM_OP_NO_DUMMY,
 			   SPI_MEM_OP_DATA_OUT(len, buf, 0));
 	ssize_t nbytes;
@@ -1113,7 +1113,7 @@ int spi_nor_erase_sector(struct spi_nor *nor, u32 addr)
 	if (nor->spimem) {
 		struct spi_mem_op op =
 			SPI_NOR_SECTOR_ERASE_OP(nor->erase_opcode,
-						nor->addr_width, addr);
+						nor->addr_nbytes, addr);
 
 		spi_nor_spimem_setup_op(nor, &op, nor->write_proto);
 
@@ -1126,13 +1126,13 @@ int spi_nor_erase_sector(struct spi_nor *nor, u32 addr)
 	 * Default implementation, if driver doesn't have a specialized HW
 	 * control
 	 */
-	for (i = nor->addr_width - 1; i >= 0; i--) {
+	for (i = nor->addr_nbytes - 1; i >= 0; i--) {
 		nor->bouncebuf[i] = addr & 0xff;
 		addr >>= 8;
 	}
 
 	return spi_nor_controller_ops_write_reg(nor, nor->erase_opcode,
-						nor->bouncebuf, nor->addr_width);
+						nor->bouncebuf, nor->addr_nbytes);
 }
 
 /**
@@ -2249,43 +2249,43 @@ static int spi_nor_default_setup(struct spi_nor *nor,
 	return 0;
 }
 
-static int spi_nor_set_addr_width(struct spi_nor *nor)
+static int spi_nor_set_addr_nbytes(struct spi_nor *nor)
 {
-	if (nor->addr_width) {
+	if (nor->addr_nbytes) {
 		/* already configured from SFDP */
 	} else if (nor->read_proto == SNOR_PROTO_8_8_8_DTR) {
 		/*
 		 * In 8D-8D-8D mode, one byte takes half a cycle to transfer. So
-		 * in this protocol an odd address width cannot be used because
+		 * in this protocol an odd addr_nbytes cannot be used because
 		 * then the address phase would only span a cycle and a half.
 		 * Half a cycle would be left over. We would then have to start
 		 * the dummy phase in the middle of a cycle and so too the data
 		 * phase, and we will end the transaction with half a cycle left
 		 * over.
 		 *
-		 * Force all 8D-8D-8D flashes to use an address width of 4 to
+		 * Force all 8D-8D-8D flashes to use an addr_nbytes of 4 to
 		 * avoid this situation.
 		 */
-		nor->addr_width = 4;
-	} else if (nor->info->addr_width) {
-		nor->addr_width = nor->info->addr_width;
+		nor->addr_nbytes = 4;
+	} else if (nor->info->addr_nbytes) {
+		nor->addr_nbytes = nor->info->addr_nbytes;
 	} else {
-		nor->addr_width = 3;
+		nor->addr_nbytes = 3;
 	}
 
-	if (nor->addr_width == 3 && nor->params->size > 0x1000000) {
+	if (nor->addr_nbytes == 3 && nor->params->size > 0x1000000) {
 		/* enable 4-byte addressing if the device exceeds 16MiB */
-		nor->addr_width = 4;
+		nor->addr_nbytes = 4;
 	}
 
-	if (nor->addr_width > SPI_NOR_MAX_ADDR_WIDTH) {
-		dev_dbg(nor->dev, "address width is too large: %u\n",
-			nor->addr_width);
+	if (nor->addr_nbytes > SPI_NOR_MAX_ADDR_NBYTES) {
+		dev_dbg(nor->dev, "The number of address bytes is too large: %u\n",
+			nor->addr_nbytes);
 		return -EINVAL;
 	}
 
 	/* Set 4byte opcodes when possible. */
-	if (nor->addr_width == 4 && nor->flags & SNOR_F_4B_OPCODES &&
+	if (nor->addr_nbytes == 4 && nor->flags & SNOR_F_4B_OPCODES &&
 	    !(nor->flags & SNOR_F_HAS_4BAIT))
 		spi_nor_set_4byte_opcodes(nor);
 
@@ -2304,7 +2304,7 @@ static int spi_nor_setup(struct spi_nor *nor,
 	if (ret)
 		return ret;
 
-	return spi_nor_set_addr_width(nor);
+	return spi_nor_set_addr_nbytes(nor);
 }
 
 /**
@@ -2492,7 +2492,7 @@ static void spi_nor_sfdp_init_params_deprecated(struct spi_nor *nor)
 
 	if (spi_nor_parse_sfdp(nor)) {
 		memcpy(nor->params, &sfdp_params, sizeof(*nor->params));
-		nor->addr_width = 0;
+		nor->addr_nbytes = 0;
 		nor->flags &= ~SNOR_F_4B_OPCODES;
 	}
 }
@@ -2713,7 +2713,7 @@ static int spi_nor_init(struct spi_nor *nor)
 	     nor->flags & SNOR_F_SWP_IS_VOLATILE))
 		spi_nor_try_unlock_all(nor);
 
-	if (nor->addr_width == 4 &&
+	if (nor->addr_nbytes == 4 &&
 	    nor->read_proto != SNOR_PROTO_8_8_8_DTR &&
 	    !(nor->flags & SNOR_F_4B_OPCODES)) {
 		/*
@@ -2840,7 +2840,7 @@ static void spi_nor_put_device(struct mtd_info *mtd)
 void spi_nor_restore(struct spi_nor *nor)
 {
 	/* restore the addressing mode */
-	if (nor->addr_width == 4 && !(nor->flags & SNOR_F_4B_OPCODES) &&
+	if (nor->addr_nbytes == 4 && !(nor->flags & SNOR_F_4B_OPCODES) &&
 	    nor->flags & SNOR_F_BROKEN_RESET)
 		nor->params->set_4byte_addr_mode(nor, false);
 
@@ -2984,7 +2984,7 @@ int spi_nor_scan(struct spi_nor *nor, const char *name,
 	 * - select op codes for (Fast) Read, Page Program and Sector Erase.
 	 * - set the number of dummy cycles (mode cycles + wait states).
 	 * - set the SPI protocols for register and memory accesses.
-	 * - set the address width.
+	 * - set the number of address bytes.
 	 */
 	ret = spi_nor_setup(nor, hwcaps);
 	if (ret)
@@ -3025,7 +3025,7 @@ static int spi_nor_create_read_dirmap(struct spi_nor *nor)
 {
 	struct spi_mem_dirmap_info info = {
 		.op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->read_opcode, 0),
-				      SPI_MEM_OP_ADDR(nor->addr_width, 0, 0),
+				      SPI_MEM_OP_ADDR(nor->addr_nbytes, 0, 0),
 				      SPI_MEM_OP_DUMMY(nor->read_dummy, 0),
 				      SPI_MEM_OP_DATA_IN(0, NULL, 0)),
 		.offset = 0,
@@ -3056,7 +3056,7 @@ static int spi_nor_create_write_dirmap(struct spi_nor *nor)
 {
 	struct spi_mem_dirmap_info info = {
 		.op_tmpl = SPI_MEM_OP(SPI_MEM_OP_CMD(nor->program_opcode, 0),
-				      SPI_MEM_OP_ADDR(nor->addr_width, 0, 0),
+				      SPI_MEM_OP_ADDR(nor->addr_nbytes, 0, 0),
 				      SPI_MEM_OP_NO_DUMMY,
 				      SPI_MEM_OP_DATA_OUT(0, NULL, 0)),
 		.offset = 0,
diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h
index 61886868cd02..14921a507b0f 100644
--- a/drivers/mtd/spi-nor/core.h
+++ b/drivers/mtd/spi-nor/core.h
@@ -84,9 +84,9 @@
 		   SPI_MEM_OP_NO_DUMMY,					\
 		   SPI_MEM_OP_NO_DATA)
 
-#define SPI_NOR_SECTOR_ERASE_OP(opcode, addr_width, addr)		\
+#define SPI_NOR_SECTOR_ERASE_OP(opcode, addr_nbytes, addr)		\
 	SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 0),				\
-		   SPI_MEM_OP_ADDR(addr_width, addr, 0),		\
+		   SPI_MEM_OP_ADDR(addr_nbytes, addr, 0),		\
 		   SPI_MEM_OP_NO_DUMMY,					\
 		   SPI_MEM_OP_NO_DATA)
 
@@ -429,7 +429,7 @@ struct spi_nor_fixups {
  *                  isn't necessarily called a "sector" by the vendor.
  * @n_sectors:      the number of sectors.
  * @page_size:      the flash's page size.
- * @addr_width:     the flash's address width.
+ * @addr_nbytes:    number of address bytes to send.
  *
  * @parse_sfdp:     true when flash supports SFDP tables. The false value has no
  *                  meaning. If one wants to skip the SFDP tables, one should
@@ -487,7 +487,7 @@ struct flash_info {
 	unsigned sector_size;
 	u16 n_sectors;
 	u16 page_size;
-	u16 addr_width;
+	u16 addr_nbytes;
 
 	bool parse_sfdp;
 	u16 flags;
@@ -548,11 +548,11 @@ struct flash_info {
 		.n_sectors = (_n_sectors),				\
 		.page_size = 256,					\
 
-#define CAT25_INFO(_sector_size, _n_sectors, _page_size, _addr_width)	\
+#define CAT25_INFO(_sector_size, _n_sectors, _page_size, _addr_nbytes)	\
 		.sector_size = (_sector_size),				\
 		.n_sectors = (_n_sectors),				\
 		.page_size = (_page_size),				\
-		.addr_width = (_addr_width),				\
+		.addr_nbytes = (_addr_nbytes),				\
 		.flags = SPI_NOR_NO_ERASE | SPI_NOR_NO_FR,		\
 
 #define OTP_INFO(_len, _n_regions, _base, _offset)			\
diff --git a/drivers/mtd/spi-nor/debugfs.c b/drivers/mtd/spi-nor/debugfs.c
index eaf84f7a0676..df76cb5de3f9 100644
--- a/drivers/mtd/spi-nor/debugfs.c
+++ b/drivers/mtd/spi-nor/debugfs.c
@@ -86,7 +86,7 @@ static int spi_nor_params_show(struct seq_file *s, void *data)
 	seq_printf(s, "size\t\t%s\n", buf);
 	seq_printf(s, "write size\t%u\n", params->writesize);
 	seq_printf(s, "page size\t%u\n", params->page_size);
-	seq_printf(s, "address width\t%u\n", nor->addr_width);
+	seq_printf(s, "address nbytes\t%u\n", nor->addr_nbytes);
 
 	seq_puts(s, "flags\t\t");
 	spi_nor_print_flags(s, nor->flags, snor_f_names, sizeof(snor_f_names));
diff --git a/drivers/mtd/spi-nor/issi.c b/drivers/mtd/spi-nor/issi.c
index 3c7d51d2b050..71687e5babdc 100644
--- a/drivers/mtd/spi-nor/issi.c
+++ b/drivers/mtd/spi-nor/issi.c
@@ -14,13 +14,13 @@ is25lp256_post_bfpt_fixups(struct spi_nor *nor,
 			   const struct sfdp_bfpt *bfpt)
 {
 	/*
-	 * IS25LP256 supports 4B opcodes, but the BFPT advertises a
-	 * BFPT_DWORD1_ADDRESS_BYTES_3_ONLY address width.
-	 * Overwrite the address width advertised by the BFPT.
+	 * IS25LP256 supports 4B opcodes, but the BFPT advertises
+	 * BFPT_DWORD1_ADDRESS_BYTES_3_ONLY.
+	 * Overwrite the number of address bytes advertised by the BFPT.
 	 */
 	if ((bfpt->dwords[BFPT_DWORD(1)] & BFPT_DWORD1_ADDRESS_BYTES_MASK) ==
 		BFPT_DWORD1_ADDRESS_BYTES_3_ONLY)
-		nor->addr_width = 4;
+		nor->addr_nbytes = 4;
 
 	return 0;
 }
diff --git a/drivers/mtd/spi-nor/otp.c b/drivers/mtd/spi-nor/otp.c
index fa63d8571218..00ab0d2d6d2f 100644
--- a/drivers/mtd/spi-nor/otp.c
+++ b/drivers/mtd/spi-nor/otp.c
@@ -35,13 +35,13 @@
  */
 int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
 {
-	u8 addr_width, read_opcode, read_dummy;
+	u8 addr_nbytes, read_opcode, read_dummy;
 	struct spi_mem_dirmap_desc *rdesc;
 	enum spi_nor_protocol read_proto;
 	int ret;
 
 	read_opcode = nor->read_opcode;
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	read_dummy = nor->read_dummy;
 	read_proto = nor->read_proto;
 	rdesc = nor->dirmap.rdesc;
@@ -54,7 +54,7 @@ int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
 	ret = spi_nor_read_data(nor, addr, len, buf);
 
 	nor->read_opcode = read_opcode;
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->read_dummy = read_dummy;
 	nor->read_proto = read_proto;
 	nor->dirmap.rdesc = rdesc;
@@ -85,11 +85,11 @@ int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len,
 {
 	enum spi_nor_protocol write_proto;
 	struct spi_mem_dirmap_desc *wdesc;
-	u8 addr_width, program_opcode;
+	u8 addr_nbytes, program_opcode;
 	int ret, written;
 
 	program_opcode = nor->program_opcode;
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	write_proto = nor->write_proto;
 	wdesc = nor->dirmap.wdesc;
 
@@ -113,7 +113,7 @@ int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len,
 
 out:
 	nor->program_opcode = program_opcode;
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->write_proto = write_proto;
 	nor->dirmap.wdesc = wdesc;
 
diff --git a/drivers/mtd/spi-nor/sfdp.c b/drivers/mtd/spi-nor/sfdp.c
index a5211543d30d..61ae8c8c5237 100644
--- a/drivers/mtd/spi-nor/sfdp.c
+++ b/drivers/mtd/spi-nor/sfdp.c
@@ -134,7 +134,7 @@ struct sfdp_4bait {
 
 /**
  * spi_nor_read_raw() - raw read of serial flash memory. read_opcode,
- *			addr_width and read_dummy members of the struct spi_nor
+ *			addr_nbytes and read_dummy members of the struct spi_nor
  *			should be previously
  * set.
  * @nor:	pointer to a 'struct spi_nor'
@@ -178,21 +178,21 @@ static int spi_nor_read_raw(struct spi_nor *nor, u32 addr, size_t len, u8 *buf)
 static int spi_nor_read_sfdp(struct spi_nor *nor, u32 addr,
 			     size_t len, void *buf)
 {
-	u8 addr_width, read_opcode, read_dummy;
+	u8 addr_nbytes, read_opcode, read_dummy;
 	int ret;
 
 	read_opcode = nor->read_opcode;
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	read_dummy = nor->read_dummy;
 
 	nor->read_opcode = SPINOR_OP_RDSFDP;
-	nor->addr_width = 3;
+	nor->addr_nbytes = 3;
 	nor->read_dummy = 8;
 
 	ret = spi_nor_read_raw(nor, addr, len, buf);
 
 	nor->read_opcode = read_opcode;
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->read_dummy = read_dummy;
 
 	return ret;
@@ -462,11 +462,11 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor,
 	switch (bfpt.dwords[BFPT_DWORD(1)] & BFPT_DWORD1_ADDRESS_BYTES_MASK) {
 	case BFPT_DWORD1_ADDRESS_BYTES_3_ONLY:
 	case BFPT_DWORD1_ADDRESS_BYTES_3_OR_4:
-		nor->addr_width = 3;
+		nor->addr_nbytes = 3;
 		break;
 
 	case BFPT_DWORD1_ADDRESS_BYTES_4_ONLY:
-		nor->addr_width = 4;
+		nor->addr_nbytes = 4;
 		break;
 
 	default:
@@ -637,12 +637,12 @@ static int spi_nor_parse_bfpt(struct spi_nor *nor,
 }
 
 /**
- * spi_nor_smpt_addr_width() - return the address width used in the
+ * spi_nor_smpt_addr_nbytes() - return the number of address bytes used in the
  *			       configuration detection command.
  * @nor:	pointer to a 'struct spi_nor'
  * @settings:	configuration detection command descriptor, dword1
  */
-static u8 spi_nor_smpt_addr_width(const struct spi_nor *nor, const u32 settings)
+static u8 spi_nor_smpt_addr_nbytes(const struct spi_nor *nor, const u32 settings)
 {
 	switch (settings & SMPT_CMD_ADDRESS_LEN_MASK) {
 	case SMPT_CMD_ADDRESS_LEN_0:
@@ -653,7 +653,7 @@ static u8 spi_nor_smpt_addr_width(const struct spi_nor *nor, const u32 settings)
 		return 4;
 	case SMPT_CMD_ADDRESS_LEN_USE_CURRENT:
 	default:
-		return nor->addr_width;
+		return nor->addr_nbytes;
 	}
 }
 
@@ -690,7 +690,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 	u32 addr;
 	int err;
 	u8 i;
-	u8 addr_width, read_opcode, read_dummy;
+	u8 addr_nbytes, read_opcode, read_dummy;
 	u8 read_data_mask, map_id;
 
 	/* Use a kmalloc'ed bounce buffer to guarantee it is DMA-able. */
@@ -698,7 +698,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 
-	addr_width = nor->addr_width;
+	addr_nbytes = nor->addr_nbytes;
 	read_dummy = nor->read_dummy;
 	read_opcode = nor->read_opcode;
 
@@ -709,7 +709,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 			break;
 
 		read_data_mask = SMPT_CMD_READ_DATA(smpt[i]);
-		nor->addr_width = spi_nor_smpt_addr_width(nor, smpt[i]);
+		nor->addr_nbytes = spi_nor_smpt_addr_nbytes(nor, smpt[i]);
 		nor->read_dummy = spi_nor_smpt_read_dummy(nor, smpt[i]);
 		nor->read_opcode = SMPT_CMD_OPCODE(smpt[i]);
 		addr = smpt[i + 1];
@@ -756,7 +756,7 @@ static const u32 *spi_nor_get_map_in_use(struct spi_nor *nor, const u32 *smpt,
 	/* fall through */
 out:
 	kfree(buf);
-	nor->addr_width = addr_width;
+	nor->addr_nbytes = addr_nbytes;
 	nor->read_dummy = read_dummy;
 	nor->read_opcode = read_opcode;
 	return ret;
@@ -1044,7 +1044,7 @@ static int spi_nor_parse_4bait(struct spi_nor *nor,
 	/*
 	 * We need at least one 4-byte op code per read, program and erase
 	 * operation; the .read(), .write() and .erase() hooks share the
-	 * nor->addr_width value.
+	 * nor->addr_nbytes value.
 	 */
 	if (!read_hwcaps || !pp_hwcaps || !erase_mask)
 		goto out;
@@ -1098,7 +1098,7 @@ static int spi_nor_parse_4bait(struct spi_nor *nor,
 	 * Spansion memory. However this quirk is no longer needed with new
 	 * SFDP compliant memories.
 	 */
-	nor->addr_width = 4;
+	nor->addr_nbytes = 4;
 	nor->flags |= SNOR_F_4B_OPCODES | SNOR_F_HAS_4BAIT;
 
 	/* fall through */
diff --git a/drivers/mtd/spi-nor/xilinx.c b/drivers/mtd/spi-nor/xilinx.c
index 1d2f5db047bd..5723157739fc 100644
--- a/drivers/mtd/spi-nor/xilinx.c
+++ b/drivers/mtd/spi-nor/xilinx.c
@@ -31,7 +31,7 @@
 		.sector_size = (8 * (_page_size)),			\
 		.n_sectors = (_n_sectors),				\
 		.page_size = (_page_size),				\
-		.addr_width = 3,					\
+		.addr_nbytes = 3,					\
 		.flags = SPI_NOR_NO_FR
 
 /* Xilinx S3AN share MFR with Atmel SPI NOR */
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 1ede4c89805a..42218a1164f6 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -351,7 +351,7 @@ struct spi_nor_flash_parameter;
  * @bouncebuf_size:	size of the bounce buffer
  * @info:		SPI NOR part JEDEC MFR ID and other info
  * @manufacturer:	SPI NOR manufacturer
- * @addr_width:		number of address bytes
+ * @addr_nbytes:	number of address bytes
  * @erase_opcode:	the opcode for erasing a sector
  * @read_opcode:	the read opcode
  * @read_dummy:		the dummy needed by the read operation
@@ -381,7 +381,7 @@ struct spi_nor {
 	size_t			bouncebuf_size;
 	const struct flash_info	*info;
 	const struct spi_nor_manufacturer *manufacturer;
-	u8			addr_width;
+	u8			addr_nbytes;
 	u8			erase_opcode;
 	u8			read_opcode;
 	u8			read_dummy;
-- 
cgit v1.2.3


From e60a7233684aa8bbe9090537720fe6a1e901d823 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 28 Jul 2022 08:10:51 +0200
Subject: Documentation: serial: move uart_ops documentation to the struct

While it's a lot of text, it always helps to keep it up to date when
it's by the source. (And not in a separate file.)

The documentation tooling also makes sure that all members of the
structure are documented. (If not, it complains loudly.)

Finally, there needs to be no comments inlined in the structure, so they
are dropped as they are superfluous now.

The compilation time of this header (tested with serial_core.c) didn't
change in my testing at all.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20220728061056.20799-1-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/driver.rst | 358 +----------------------------
 include/linux/serial_core.h                | 345 +++++++++++++++++++++++++--
 2 files changed, 331 insertions(+), 372 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
index ee1679858aa2..cb0ec6db4f1e 100644
--- a/Documentation/driver-api/serial/driver.rst
+++ b/Documentation/driver-api/serial/driver.rst
@@ -63,362 +63,8 @@ commonly referred to as the port mutex.
 uart_ops
 --------
 
-The uart_ops structure is the main interface between serial_core and the
-hardware specific driver.  It contains all the methods to control the
-hardware.
-
-  tx_empty(port)
-	This function tests whether the transmitter fifo and shifter
-	for the port described by 'port' is empty.  If it is empty,
-	this function should return TIOCSER_TEMT, otherwise return 0.
-	If the port does not support this operation, then it should
-	return TIOCSER_TEMT.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_mctrl(port, mctrl)
-	This function sets the modem control lines for port described
-	by 'port' to the state described by mctrl.  The relevant bits
-	of mctrl are:
-
-		- TIOCM_RTS	RTS signal.
-		- TIOCM_DTR	DTR signal.
-		- TIOCM_OUT1	OUT1 signal.
-		- TIOCM_OUT2	OUT2 signal.
-		- TIOCM_LOOP	Set the port into loopback mode.
-
-	If the appropriate bit is set, the signal should be driven
-	active.  If the bit is clear, the signal should be driven
-	inactive.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  get_mctrl(port)
-	Returns the current state of modem control inputs.  The state
-	of the outputs should not be returned, since the core keeps
-	track of their state.  The state information should include:
-
-		- TIOCM_CAR	state of DCD signal
-		- TIOCM_CTS	state of CTS signal
-		- TIOCM_DSR	state of DSR signal
-		- TIOCM_RI	state of RI signal
-
-	The bit is set if the signal is currently driven active.  If
-	the port does not support CTS, DCD or DSR, the driver should
-	indicate that the signal is permanently active.  If RI is
-	not available, the signal should not be indicated as active.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  stop_tx(port)
-	Stop transmitting characters.  This might be due to the CTS
-	line becoming inactive or the tty layer indicating we want
-	to stop transmission due to an XOFF character.
-
-	The driver should stop transmitting characters as soon as
-	possible.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  start_tx(port)
-	Start transmitting characters.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  throttle(port)
-	Notify the serial driver that input buffers for the line discipline are
-	close to full, and it should somehow signal that no more characters
-	should be sent to the serial port.
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .unthrottle() and termios modification by the
-	tty layer.
-
-  unthrottle(port)
-	Notify the serial driver that characters can now be sent to the serial
-	port without fear of overrunning the input buffers of the line
-	disciplines.
-
-	This will be called only if hardware assisted flow control is enabled.
-
-	Locking: serialized with .throttle() and termios modification by the
-	tty layer.
-
-  send_xchar(port,ch)
-	Transmit a high priority character, even if the port is stopped.
-	This is used to implement XON/XOFF flow control and tcflow().  If
-	the serial driver does not implement this function, the tty core
-	will append the character to the circular buffer and then call
-	start_tx() / stop_tx() to flush the data out.
-
-	Do not transmit if ch == '\0' (__DISABLED_CHAR).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  stop_rx(port)
-	Stop receiving characters; the port is in the process of
-	being closed.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  enable_ms(port)
-	Enable the modem status interrupts.
-
-	This method may be called multiple times.  Modem status
-	interrupts should be disabled when the shutdown method is
-	called.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  break_ctl(port,ctl)
-	Control the transmission of a break signal.  If ctl is
-	nonzero, the break signal should be transmitted.  The signal
-	should be terminated when another call is made with a zero
-	ctl.
-
-	Locking: caller holds tty_port->mutex
-
-  startup(port)
-	Grab any interrupt resources and initialise any low level driver
-	state.  Enable the port for reception.  It should not activate
-	RTS nor DTR; this will be done via a separate call to set_mctrl.
-
-	This method will only be called when the port is initially opened.
-
-	Locking: port_sem taken.
-
-	Interrupts: globally disabled.
-
-  shutdown(port)
-	Disable the port, disable any break condition that may be in
-	effect, and free any interrupt resources.  It should not disable
-	RTS nor DTR; this will have already been done via a separate
-	call to set_mctrl.
-
-	Drivers must not access port->state once this call has completed.
-
-	This method will only be called when there are no more users of
-	this port.
-
-	Locking: port_sem taken.
-
-	Interrupts: caller dependent.
-
-  flush_buffer(port)
-	Flush any write buffers, reset any DMA state and stop any
-	ongoing DMA transfers.
-
-	This will be called whenever the port->state->xmit circular
-	buffer is cleared.
-
-	Locking: port->lock taken.
-
-	Interrupts: locally disabled.
-
-	This call must not sleep
-
-  set_termios(port,termios,oldtermios)
-	Change the port parameters, including word length, parity, stop
-	bits.  Update read_status_mask and ignore_status_mask to indicate
-	the types of events we are interested in receiving.  Relevant
-	termios->c_cflag bits are:
-
-		CSIZE
-			- word size
-		CSTOPB
-			- 2 stop bits
-		PARENB
-			- parity enable
-		PARODD
-			- odd parity (when PARENB is in force)
-		ADDRB
-			- address bit (changed through .rs485_config()).
-		CREAD
-			- enable reception of characters (if not set,
-			  still receive characters from the port, but
-			  throw them away.
-		CRTSCTS
-			- if set, enable CTS status change reporting
-		CLOCAL
-			- if not set, enable modem status change
-			  reporting.
-
-	Relevant termios->c_iflag bits are:
-
-		INPCK
-			- enable frame and parity error events to be
-			  passed to the TTY layer.
-		BRKINT / PARMRK
-			- both of these enable break events to be
-			  passed to the TTY layer.
-
-		IGNPAR
-			- ignore parity and framing errors
-		IGNBRK
-			- ignore break errors,  If IGNPAR is also
-			  set, ignore overrun errors as well.
-
-	The interaction of the iflag bits is as follows (parity error
-	given as an example):
-
-	=============== ======= ======  =============================
-	Parity error	INPCK	IGNPAR
-	=============== ======= ======  =============================
-	n/a		0	n/a	character received, marked as
-					TTY_NORMAL
-	None		1	n/a	character received, marked as
-					TTY_NORMAL
-	Yes		1	0	character received, marked as
-					TTY_PARITY
-	Yes		1	1	character discarded
-	=============== ======= ======  =============================
-
-	Other flags may be used (eg, xon/xoff characters) if your
-	hardware supports hardware "soft" flow control.
-
-	Locking: caller holds tty_port->mutex
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  set_ldisc(port,termios)
-	Notifier for discipline change. See ../tty/tty_ldisc.rst.
-
-	Locking: caller holds tty_port->mutex
-
-  pm(port,state,oldstate)
-	Perform any power management related activities on the specified
-	port.  State indicates the new state (defined by
-	enum uart_pm_state), oldstate indicates the previous state.
-
-	This function should not be used to grab any resources.
-
-	This will be called when the port is initially opened and finally
-	closed, except when the port is also the system console.  This
-	will occur even if CONFIG_PM is not set.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  type(port)
-	Return a pointer to a string constant describing the specified
-	port, or return NULL, in which case the string 'unknown' is
-	substituted.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  release_port(port)
-	Release any memory and IO region resources currently in use by
-	the port.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  request_port(port)
-	Request any memory and IO region resources required by the port.
-	If any fail, no resources should be registered when this function
-	returns, and it should return -EBUSY on failure.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  config_port(port,type)
-	Perform any autoconfiguration steps required for the port.  `type`
-	contains a bit mask of the required configuration.  UART_CONFIG_TYPE
-	indicates that the port requires detection and identification.
-	port->type should be set to the type found, or PORT_UNKNOWN if
-	no port was detected.
-
-	UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
-	which should be probed using standard kernel autoprobing techniques.
-	This is not necessary on platforms where ports have interrupts
-	internally hard wired (eg, system on a chip implementations).
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  verify_port(port,serinfo)
-	Verify the new serial port information contained within serinfo is
-	suitable for this port type.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  ioctl(port,cmd,arg)
-	Perform any port specific IOCTLs.  IOCTL commands must be defined
-	using the standard numbering system found in <asm/ioctl.h>
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-  poll_init(port)
-	Called by kgdb to perform the minimal hardware initialization needed
-	to support poll_put_char() and poll_get_char().  Unlike ->startup()
-	this should not request interrupts.
-
-	Locking: tty_mutex and tty_port->mutex taken.
-
-	Interrupts: n/a.
-
-  poll_put_char(port,ch)
-	Called by kgdb to write a single character directly to the serial
-	port.  It can and should block until there is space in the TX FIFO.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
-
-  poll_get_char(port)
-	Called by kgdb to read a single character directly from the serial
-	port.  If data is available, it should be returned; otherwise
-	the function should return NO_POLL_CHAR immediately.
-
-	Locking: none.
-
-	Interrupts: caller dependent.
-
-	This call must not sleep
+.. kernel-doc:: include/linux/serial_core.h
+   :identifiers: uart_ops
 
 Other functions
 ---------------
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index a6fa7c40c330..35d2f9e8027d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -31,9 +31,336 @@ struct serial_struct;
 struct device;
 struct gpio_desc;
 
-/*
+/**
+ * struct uart_ops -- interface between serial_core and the driver
+ *
  * This structure describes all the operations that can be done on the
- * physical hardware.  See Documentation/driver-api/serial/driver.rst for details.
+ * physical hardware.
+ *
+ * @tx_empty: ``unsigned int ()(struct uart_port *port)``
+ *
+ *	This function tests whether the transmitter fifo and shifter for the
+ *	@port is empty. If it is empty, this function should return
+ *	%TIOCSER_TEMT, otherwise return 0. If the port does not support this
+ *	operation, then it should return %TIOCSER_TEMT.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
+ *
+ * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)``
+ *
+ *	This function sets the modem control lines for @port to the state
+ *	described by @mctrl. The relevant bits of @mctrl are:
+ *
+ *		- %TIOCM_RTS	RTS signal.
+ *		- %TIOCM_DTR	DTR signal.
+ *		- %TIOCM_OUT1	OUT1 signal.
+ *		- %TIOCM_OUT2	OUT2 signal.
+ *		- %TIOCM_LOOP	Set the port into loopback mode.
+ *
+ *	If the appropriate bit is set, the signal should be driven
+ *	active.  If the bit is clear, the signal should be driven
+ *	inactive.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @get_mctrl: ``unsigned int ()(struct uart_port *port)``
+ *
+ *	Returns the current state of modem control inputs of @port. The state
+ *	of the outputs should not be returned, since the core keeps track of
+ *	their state. The state information should include:
+ *
+ *		- %TIOCM_CAR	state of DCD signal
+ *		- %TIOCM_CTS	state of CTS signal
+ *		- %TIOCM_DSR	state of DSR signal
+ *		- %TIOCM_RI	state of RI signal
+ *
+ *	The bit is set if the signal is currently driven active.  If
+ *	the port does not support CTS, DCD or DSR, the driver should
+ *	indicate that the signal is permanently active. If RI is
+ *	not available, the signal should not be indicated as active.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @stop_tx: ``void ()(struct uart_port *port)``
+ *
+ *	Stop transmitting characters. This might be due to the CTS line
+ *	becoming inactive or the tty layer indicating we want to stop
+ *	transmission due to an %XOFF character.
+ *
+ *	The driver should stop transmitting characters as soon as possible.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @start_tx: ``void ()(struct uart_port *port)``
+ *
+ *	Start transmitting characters.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @throttle: ``void ()(struct uart_port *port)``
+ *
+ *	Notify the serial driver that input buffers for the line discipline are
+ *	close to full, and it should somehow signal that no more characters
+ *	should be sent to the serial port.
+ *	This will be called only if hardware assisted flow control is enabled.
+ *
+ *	Locking: serialized with @unthrottle() and termios modification by the
+ *	tty layer.
+ *
+ * @unthrottle: ``void ()(struct uart_port *port)``
+ *
+ *	Notify the serial driver that characters can now be sent to the serial
+ *	port without fear of overrunning the input buffers of the line
+ *	disciplines.
+ *
+ *	This will be called only if hardware assisted flow control is enabled.
+ *
+ *	Locking: serialized with @throttle() and termios modification by the
+ *	tty layer.
+ *
+ * @send_xchar: ``void ()(struct uart_port *port, char ch)``
+ *
+ *	Transmit a high priority character, even if the port is stopped. This
+ *	is used to implement XON/XOFF flow control and tcflow(). If the serial
+ *	driver does not implement this function, the tty core will append the
+ *	character to the circular buffer and then call start_tx() / stop_tx()
+ *	to flush the data out.
+ *
+ *	Do not transmit if @ch == '\0' (%__DISABLED_CHAR).
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @stop_rx: ``void ()(struct uart_port *port)``
+ *
+ *	Stop receiving characters; the @port is in the process of being closed.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @enable_ms: ``void ()(struct uart_port *port)``
+ *
+ *	Enable the modem status interrupts.
+ *
+ *	This method may be called multiple times. Modem status interrupts
+ *	should be disabled when the @shutdown() method is called.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @break_ctl: ``void ()(struct uart_port *port, int ctl)``
+ *
+ *	Control the transmission of a break signal. If @ctl is nonzero, the
+ *	break signal should be transmitted. The signal should be terminated
+ *	when another call is made with a zero @ctl.
+ *
+ *	Locking: caller holds tty_port->mutex
+ *
+ * @startup: ``int ()(struct uart_port *port)``
+ *
+ *	Grab any interrupt resources and initialise any low level driver state.
+ *	Enable the port for reception. It should not activate RTS nor DTR;
+ *	this will be done via a separate call to @set_mctrl().
+ *
+ *	This method will only be called when the port is initially opened.
+ *
+ *	Locking: port_sem taken.
+ *	Interrupts: globally disabled.
+ *
+ * @shutdown: ``void ()(struct uart_port *port)``
+ *
+ *	Disable the @port, disable any break condition that may be in effect,
+ *	and free any interrupt resources. It should not disable RTS nor DTR;
+ *	this will have already been done via a separate call to @set_mctrl().
+ *
+ *	Drivers must not access @port->state once this call has completed.
+ *
+ *	This method will only be called when there are no more users of this
+ *	@port.
+ *
+ *	Locking: port_sem taken.
+ *	Interrupts: caller dependent.
+ *
+ * @flush_buffer: ``void ()(struct uart_port *port)``
+ *
+ *	Flush any write buffers, reset any DMA state and stop any ongoing DMA
+ *	transfers.
+ *
+ *	This will be called whenever the @port->state->xmit circular buffer is
+ *	cleared.
+ *
+ *	Locking: @port->lock taken.
+ *	Interrupts: locally disabled.
+ *	This call must not sleep
+ *
+ * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new,
+ *			struct ktermios *old)``
+ *
+ *	Change the @port parameters, including word length, parity, stop bits.
+ *	Update @port->read_status_mask and @port->ignore_status_mask to
+ *	indicate the types of events we are interested in receiving. Relevant
+ *	ktermios::c_cflag bits are:
+ *
+ *	- %CSIZE - word size
+ *	- %CSTOPB - 2 stop bits
+ *	- %PARENB - parity enable
+ *	- %PARODD - odd parity (when %PARENB is in force)
+ *	- %ADDRB - address bit (changed through uart_port::rs485_config()).
+ *	- %CREAD - enable reception of characters (if not set, still receive
+ *	  characters from the port, but throw them away).
+ *	- %CRTSCTS - if set, enable CTS status change reporting.
+ *	- %CLOCAL - if not set, enable modem status change reporting.
+ *
+ *	Relevant ktermios::c_iflag bits are:
+ *
+ *	- %INPCK - enable frame and parity error events to be passed to the TTY
+ *	  layer.
+ *	- %BRKINT / %PARMRK - both of these enable break events to be passed to
+ *	  the TTY layer.
+ *	- %IGNPAR - ignore parity and framing errors.
+ *	- %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun
+ *	  errors as well.
+ *
+ *	The interaction of the ktermios::c_iflag bits is as follows (parity
+ *	error given as an example):
+ *
+ *	============ ======= ======= =========================================
+ *	Parity error INPCK   IGNPAR
+ *	============ ======= ======= =========================================
+ *	n/a	     0	     n/a     character received, marked as %TTY_NORMAL
+ *	None	     1	     n/a     character received, marked as %TTY_NORMAL
+ *	Yes	     1	     0	     character received, marked as %TTY_PARITY
+ *	Yes	     1	     1	     character discarded
+ *	============ ======= ======= =========================================
+ *
+ *	Other flags may be used (eg, xon/xoff characters) if your hardware
+ *	supports hardware "soft" flow control.
+ *
+ *	Locking: caller holds tty_port->mutex
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
+ *
+ * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)``
+ *
+ *	Notifier for discipline change. See
+ *	Documentation/driver-api/tty/tty_ldisc.rst.
+ *
+ *	Locking: caller holds tty_port->mutex
+ *
+ * @pm: ``void ()(struct uart_port *port, unsigned int state,
+ *		 unsigned int oldstate)``
+ *
+ *	Perform any power management related activities on the specified @port.
+ *	@state indicates the new state (defined by enum uart_pm_state),
+ *	@oldstate indicates the previous state.
+ *
+ *	This function should not be used to grab any resources.
+ *
+ *	This will be called when the @port is initially opened and finally
+ *	closed, except when the @port is also the system console. This will
+ *	occur even if %CONFIG_PM is not set.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @type: ``const char *()(struct uart_port *port)``
+ *
+ *	Return a pointer to a string constant describing the specified @port,
+ *	or return %NULL, in which case the string 'unknown' is substituted.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @release_port: ``void ()(struct uart_port *port)``
+ *
+ *	Release any memory and IO region resources currently in use by the
+ *	@port.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @request_port: ``int ()(struct uart_port *port)``
+ *
+ *	Request any memory and IO region resources required by the port. If any
+ *	fail, no resources should be registered when this function returns, and
+ *	it should return -%EBUSY on failure.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @config_port: ``void ()(struct uart_port *port, int type)``
+ *
+ *	Perform any autoconfiguration steps required for the @port. @type
+ *	contains a bit mask of the required configuration. %UART_CONFIG_TYPE
+ *	indicates that the port requires detection and identification.
+ *	@port->type should be set to the type found, or %PORT_UNKNOWN if no
+ *	port was detected.
+ *
+ *	%UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+ *	which should be probed using standard kernel autoprobing techniques.
+ *	This is not necessary on platforms where ports have interrupts
+ *	internally hard wired (eg, system on a chip implementations).
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @verify_port: ``int ()(struct uart_port *port,
+ *			struct serial_struct *serinfo)``
+ *
+ *	Verify the new serial port information contained within @serinfo is
+ *	suitable for this port type.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd,
+ *		unsigned long arg)``
+ *
+ *	Perform any port specific IOCTLs. IOCTL commands must be defined using
+ *	the standard numbering system found in <asm/ioctl.h>.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *
+ * @poll_init: ``int ()(struct uart_port *port)``
+ *
+ *	Called by kgdb to perform the minimal hardware initialization needed to
+ *	support @poll_put_char() and @poll_get_char(). Unlike @startup(), this
+ *	should not request interrupts.
+ *
+ *	Locking: %tty_mutex and tty_port->mutex taken.
+ *	Interrupts: n/a.
+ *
+ * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)``
+ *
+ *	Called by kgdb to write a single character @ch directly to the serial
+ *	@port. It can and should block until there is space in the TX FIFO.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
+ *
+ * @poll_get_char: ``int ()(struct uart_port *port)``
+ *
+ *	Called by kgdb to read a single character directly from the serial
+ *	port. If data is available, it should be returned; otherwise the
+ *	function should return %NO_POLL_CHAR immediately.
+ *
+ *	Locking: none.
+ *	Interrupts: caller dependent.
+ *	This call must not sleep
  */
 struct uart_ops {
 	unsigned int	(*tx_empty)(struct uart_port *);
@@ -56,22 +383,8 @@ struct uart_ops {
 	void		(*set_ldisc)(struct uart_port *, struct ktermios *);
 	void		(*pm)(struct uart_port *, unsigned int state,
 			      unsigned int oldstate);
-
-	/*
-	 * Return a string describing the type of the port
-	 */
 	const char	*(*type)(struct uart_port *);
-
-	/*
-	 * Release IO and memory resources used by the port.
-	 * This includes iounmap if necessary.
-	 */
 	void		(*release_port)(struct uart_port *);
-
-	/*
-	 * Request IO and memory resources used by the port.
-	 * This includes iomapping the port if necessary.
-	 */
 	int		(*request_port)(struct uart_port *);
 	void		(*config_port)(struct uart_port *, int);
 	int		(*verify_port)(struct uart_port *, struct serial_struct *);
-- 
cgit v1.2.3


From 6dd71251b9aeedd540fd7003bc5f73d59dd6dcb2 Mon Sep 17 00:00:00 2001
From: Xin Gao <gaoxin@cdjrlc.com>
Date: Fri, 22 Jul 2022 10:23:37 +0800
Subject: platform/x86: pmc_atom: Fix comment typo

The double `of' is duplicated in line 50, remove one.

Signed-off-by: Xin Gao <gaoxin@cdjrlc.com>
Link: https://lore.kernel.org/r/20220722022337.15903-1-gaoxin@cdjrlc.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/platform_data/x86/pmc_atom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/x86/pmc_atom.h b/include/linux/platform_data/x86/pmc_atom.h
index 6807839c718b..3edfb6d4e67a 100644
--- a/include/linux/platform_data/x86/pmc_atom.h
+++ b/include/linux/platform_data/x86/pmc_atom.h
@@ -47,7 +47,7 @@
 #define	PMC_S0I2_TMR		0x88
 #define	PMC_S0I3_TMR		0x8C
 #define	PMC_S0_TMR		0x90
-/* Sleep state counter is in units of of 32us */
+/* Sleep state counter is in units of 32us */
 #define	PMC_TMR_SHIFT		5
 
 /* Power status of power islands */
-- 
cgit v1.2.3


From 0fcb100d50835d6823723ef0898cd565b3796e0a Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 22 Jul 2022 09:38:21 +0000
Subject: dm bufio: Add flags argument to dm_bufio_client_create

Add a flags argument to dm_bufio_client_create and update all the
callers. This is in preparation to add the DM_BUFIO_NO_SLEEP flag.

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-bufio.c                         | 3 ++-
 drivers/md/dm-ebs-target.c                    | 3 ++-
 drivers/md/dm-integrity.c                     | 2 +-
 drivers/md/dm-snap-persistent.c               | 2 +-
 drivers/md/dm-verity-fec.c                    | 4 ++--
 drivers/md/dm-verity-target.c                 | 2 +-
 drivers/md/persistent-data/dm-block-manager.c | 3 ++-
 include/linux/dm-bufio.h                      | 3 ++-
 8 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5ffa1dcf84cf..ad5603eb12e3 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1717,7 +1717,8 @@ static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrin
 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
 					       unsigned reserved_buffers, unsigned aux_size,
 					       void (*alloc_callback)(struct dm_buffer *),
-					       void (*write_callback)(struct dm_buffer *))
+					       void (*write_callback)(struct dm_buffer *),
+					       unsigned int flags)
 {
 	int r;
 	struct dm_bufio_client *c;
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 0221fa63f888..04a1f56d6588 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -312,7 +312,8 @@ static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL);
+	ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1,
+					   0, NULL, NULL, 0);
 	if (IS_ERR(ec->bufio)) {
 		ti->error = "Cannot create dm bufio client";
 		r = PTR_ERR(ec->bufio);
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 3d5a0ce123c9..a508073d8414 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4439,7 +4439,7 @@ try_smaller_buffer:
 	}
 
 	ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
-			1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL);
+			1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL, 0);
 	if (IS_ERR(ic->bufio)) {
 		r = PTR_ERR(ic->bufio);
 		ti->error = "Cannot initialize dm-bufio";
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3bb5cff5d6fc..aaa699749c3b 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -494,7 +494,7 @@ static int read_exceptions(struct pstore *ps,
 
 	client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
 					ps->store->chunk_size << SECTOR_SHIFT,
-					1, 0, NULL, NULL);
+					1, 0, NULL, NULL, 0);
 
 	if (IS_ERR(client))
 		return PTR_ERR(client);
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index cea2b3789736..23cffce56403 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -749,7 +749,7 @@ int verity_fec_ctr(struct dm_verity *v)
 
 	f->bufio = dm_bufio_client_create(f->dev->bdev,
 					  f->io_size,
-					  1, 0, NULL, NULL);
+					  1, 0, NULL, NULL, 0);
 	if (IS_ERR(f->bufio)) {
 		ti->error = "Cannot initialize FEC bufio client";
 		return PTR_ERR(f->bufio);
@@ -765,7 +765,7 @@ int verity_fec_ctr(struct dm_verity *v)
 
 	f->data_bufio = dm_bufio_client_create(v->data_dev->bdev,
 					       1 << v->data_dev_block_bits,
-					       1, 0, NULL, NULL);
+					       1, 0, NULL, NULL, 0);
 	if (IS_ERR(f->data_bufio)) {
 		ti->error = "Cannot initialize FEC data bufio client";
 		return PTR_ERR(f->data_bufio);
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 75b66dd67633..95db3a7ee3c7 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1265,7 +1265,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
 		1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
-		dm_bufio_alloc_callback, NULL);
+		dm_bufio_alloc_callback, NULL, 0);
 	if (IS_ERR(v->bufio)) {
 		ti->error = "Cannot initialize dm-bufio";
 		r = PTR_ERR(v->bufio);
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 54c089a50b15..11935864f50f 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -391,7 +391,8 @@ struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
 	bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
 					   sizeof(struct buffer_aux),
 					   dm_block_manager_alloc_callback,
-					   dm_block_manager_write_callback);
+					   dm_block_manager_write_callback,
+					   0);
 	if (IS_ERR(bm->bufio)) {
 		r = PTR_ERR(bm->bufio);
 		kfree(bm);
diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h
index 90bd558a17f5..e21480715255 100644
--- a/include/linux/dm-bufio.h
+++ b/include/linux/dm-bufio.h
@@ -24,7 +24,8 @@ struct dm_bufio_client *
 dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
 		       unsigned reserved_buffers, unsigned aux_size,
 		       void (*alloc_callback)(struct dm_buffer *),
-		       void (*write_callback)(struct dm_buffer *));
+		       void (*write_callback)(struct dm_buffer *),
+		       unsigned int flags);
 
 /*
  * Release a buffered IO cache.
-- 
cgit v1.2.3


From b32d45824aa7e07a0c3257a16e3a2a691b11b39a Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Fri, 22 Jul 2022 09:38:22 +0000
Subject: dm bufio: Add DM_BUFIO_CLIENT_NO_SLEEP flag

Add an optional flag that ensures dm_bufio_client does not sleep
(primary focus is to service dm_bufio_get without sleeping). This
allows the dm-bufio cache to be queried from interrupt context.

To ensure that dm-bufio does not sleep, dm-bufio must use a spinlock
instead of a mutex. Additionally, to avoid deadlocks, special care
must be taken so that dm-bufio does not sleep while holding the
spinlock.

But again: the scope of this no_sleep is initially confined to
dm_bufio_get, so __alloc_buffer_wait_no_callback is _not_ changed to
avoid sleeping because __bufio_new avoids allocation for NF_GET.

Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-bufio.c    | 22 +++++++++++++++++++---
 include/linux/dm-bufio.h |  5 +++++
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index ad5603eb12e3..486179d1831c 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -81,6 +81,8 @@
  */
 struct dm_bufio_client {
 	struct mutex lock;
+	spinlock_t spinlock;
+	unsigned long spinlock_flags;
 
 	struct list_head lru[LIST_SIZE];
 	unsigned long n_buffers[LIST_SIZE];
@@ -90,6 +92,7 @@ struct dm_bufio_client {
 	s8 sectors_per_block_bits;
 	void (*alloc_callback)(struct dm_buffer *);
 	void (*write_callback)(struct dm_buffer *);
+	bool no_sleep;
 
 	struct kmem_cache *slab_buffer;
 	struct kmem_cache *slab_cache;
@@ -167,17 +170,26 @@ struct dm_buffer {
 
 static void dm_bufio_lock(struct dm_bufio_client *c)
 {
-	mutex_lock_nested(&c->lock, dm_bufio_in_request());
+	if (c->no_sleep)
+		spin_lock_irqsave_nested(&c->spinlock, c->spinlock_flags, dm_bufio_in_request());
+	else
+		mutex_lock_nested(&c->lock, dm_bufio_in_request());
 }
 
 static int dm_bufio_trylock(struct dm_bufio_client *c)
 {
-	return mutex_trylock(&c->lock);
+	if (c->no_sleep)
+		return spin_trylock_irqsave(&c->spinlock, c->spinlock_flags);
+	else
+		return mutex_trylock(&c->lock);
 }
 
 static void dm_bufio_unlock(struct dm_bufio_client *c)
 {
-	mutex_unlock(&c->lock);
+	if (c->no_sleep)
+		spin_unlock_irqrestore(&c->spinlock, c->spinlock_flags);
+	else
+		mutex_unlock(&c->lock);
 }
 
 /*----------------------------------------------------------------*/
@@ -1748,12 +1760,16 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	c->alloc_callback = alloc_callback;
 	c->write_callback = write_callback;
 
+	if (flags & DM_BUFIO_CLIENT_NO_SLEEP)
+		c->no_sleep = true;
+
 	for (i = 0; i < LIST_SIZE; i++) {
 		INIT_LIST_HEAD(&c->lru[i]);
 		c->n_buffers[i] = 0;
 	}
 
 	mutex_init(&c->lock);
+	spin_lock_init(&c->spinlock);
 	INIT_LIST_HEAD(&c->reserved_buffers);
 	c->need_reserved_buffers = reserved_buffers;
 
diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h
index e21480715255..15d9e15ca830 100644
--- a/include/linux/dm-bufio.h
+++ b/include/linux/dm-bufio.h
@@ -17,6 +17,11 @@
 struct dm_bufio_client;
 struct dm_buffer;
 
+/*
+ * Flags for dm_bufio_client_create
+ */
+#define DM_BUFIO_CLIENT_NO_SLEEP 0x1
+
 /*
  * Create a buffered IO cache on a given device
  */
-- 
cgit v1.2.3


From 0ad722f159e44983ddea1929ffd90d0c20a86f24 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 15 Jul 2022 17:36:16 +0200
Subject: PCI: Remove pci_mmap_page_range() wrapper

The ARCH_GENERIC_PCI_MMAP_RESOURCE symbol came up in a recent discussion,
and I noticed that this was left behind by an unfinished cleanup from 2017.

The only architecture that still relies on providing its own
pci_mmap_page_range() helper instead of using the generic
pci_mmap_resource_range() is sparc. Presumably the reasons for this have
not changed, but at least this can be simplified by converting sparc to use
the same interface as the others.

The only difference between the two is the device-specific offset that gets
added to or subtracted from vma->vm_pgoff.

Change the only caller of pci_mmap_page_range() in common code to subtract
this offset and call the modern interface, while adding it back in the
sparc implementation to preserve the existing behavior.

This removes the complexities of the dual interfaces from the common code,
and keeps it all specific to the sparc architecture code. According to
David Miller, the sparc code lets user space poke into the VGA I/O port
registers by mmapping the I/O space of the parent bridge device, which is
something that the generic pci_mmap_resource_range() code apparently does
not.

Link: https://lore.kernel.org/lkml/1519887203.622.3.camel@infradead.org/t/
Link: https://lore.kernel.org/lkml/20220714214657.2402250-3-shorne@gmail.com/
Link: https://lore.kernel.org/r/20220715153617.3393420-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Stafford Horne <shorne@gmail.com>
---
 Documentation/PCI/sysfs-pci.rst |  2 +-
 arch/sparc/kernel/pci.c         | 13 ++++++++----
 drivers/pci/mmap.c              | 44 -----------------------------------------
 drivers/pci/proc.c              |  7 ++++++-
 include/linux/pci.h             | 12 +----------
 5 files changed, 17 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/PCI/sysfs-pci.rst b/Documentation/PCI/sysfs-pci.rst
index 742fbd21dc1f..f495185aa88a 100644
--- a/Documentation/PCI/sysfs-pci.rst
+++ b/Documentation/PCI/sysfs-pci.rst
@@ -125,7 +125,7 @@ implementation of that functionality. To support the historical interface of
 mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP.
 
 Alternatively, platforms which set HAVE_PCI_MMAP may provide their own
-implementation of pci_mmap_page_range() instead of defining
+implementation of pci_mmap_resource_range() instead of defining
 ARCH_GENERIC_PCI_MMAP_RESOURCE.
 
 Platforms which support write-combining maps of PCI resources must define
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 31b0c1983286..f580db840bf7 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -876,17 +876,22 @@ static void __pci_mmap_set_pgprot(struct pci_dev *dev, struct vm_area_struct *vm
 
 /* Perform the actual remap of the pages for a PCI device mapping, as appropriate
  * for this architecture.  The region in the process to map is described by vm_start
- * and vm_end members of VMA, the base physical address is found in vm_pgoff.
+ * and vm_end members of VMA, the BAR relative address is found in vm_pgoff.
  * The pci device structure is provided so that architectures may make mapping
  * decisions on a per-device or per-bus basis.
  *
  * Returns a negative error code on failure, zero on success.
  */
-int pci_mmap_page_range(struct pci_dev *dev, int bar,
-			struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
+int pci_mmap_resource_range(struct pci_dev *dev, int bar,
+			    struct vm_area_struct *vma,
+			    enum pci_mmap_state mmap_state, int write_combine)
 {
 	int ret;
+	resource_size_t start, end;
+
+	/* convert per-BAR address to PCI bus address */
+	pci_resource_to_user(dev, bar, &dev->resource[bar], &start, &end);
+	vma->vm_pgoff += start >> PAGE_SHIFT;
 
 	ret = __pci_mmap_make_offset(dev, vma, mmap_state);
 	if (ret < 0)
diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c
index b8c9011987f4..4504039056d1 100644
--- a/drivers/pci/mmap.c
+++ b/drivers/pci/mmap.c
@@ -13,27 +13,6 @@
 
 #ifdef ARCH_GENERIC_PCI_MMAP_RESOURCE
 
-/*
- * Modern setup: generic pci_mmap_resource_range(), and implement the legacy
- * pci_mmap_page_range() (if needed) as a wrapper round it.
- */
-
-#ifdef HAVE_PCI_MMAP
-int pci_mmap_page_range(struct pci_dev *pdev, int bar,
-			struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
-{
-	resource_size_t start, end;
-
-	pci_resource_to_user(pdev, bar, &pdev->resource[bar], &start, &end);
-
-	/* Adjust vm_pgoff to be the offset within the resource */
-	vma->vm_pgoff -= start >> PAGE_SHIFT;
-	return pci_mmap_resource_range(pdev, bar, vma, mmap_state,
-				       write_combine);
-}
-#endif
-
 static const struct vm_operations_struct pci_phys_vm_ops = {
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 	.access = generic_access_phys,
@@ -70,27 +49,4 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
 				  vma->vm_page_prot);
 }
 
-#elif defined(HAVE_PCI_MMAP) /* && !ARCH_GENERIC_PCI_MMAP_RESOURCE */
-
-/*
- * Legacy setup: Implement pci_mmap_resource_range() as a wrapper around
- * the architecture's pci_mmap_page_range(), converting to "user visible"
- * addresses as necessary.
- */
-
-int pci_mmap_resource_range(struct pci_dev *pdev, int bar,
-			    struct vm_area_struct *vma,
-			    enum pci_mmap_state mmap_state, int write_combine)
-{
-	resource_size_t start, end;
-
-	/*
-	 * pci_mmap_page_range() expects the same kind of entry as coming
-	 * from /proc/bus/pci/ which is a "user visible" value. If this is
-	 * different from the resource itself, arch will do necessary fixup.
-	 */
-	pci_resource_to_user(pdev, bar, &pdev->resource[bar], &start, &end);
-	vma->vm_pgoff += start >> PAGE_SHIFT;
-	return pci_mmap_page_range(pdev, bar, vma, mmap_state, write_combine);
-}
 #endif
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 31b26d8ea6cc..f967709082d6 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -244,6 +244,7 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct pci_dev *dev = pde_data(file_inode(file));
 	struct pci_filp_private *fpriv = file->private_data;
+	resource_size_t start, end;
 	int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM;
 
 	if (!capable(CAP_SYS_RAWIO) ||
@@ -278,7 +279,11 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
 	    iomem_is_exclusive(dev->resource[i].start))
 		return -EINVAL;
 
-	ret = pci_mmap_page_range(dev, i, vma,
+	pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+
+	/* Adjust vm_pgoff to be the offset within the resource */
+	vma->vm_pgoff -= start >> PAGE_SHIFT;
+	ret = pci_mmap_resource_range(dev, i, vma,
 				  fpriv->mmap_state, write_combine);
 	if (ret < 0)
 		return ret;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 81a57b498f22..060af91bafcd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1909,24 +1909,14 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
 
 #include <asm/pci.h>
 
-/* These two functions provide almost identical functionality. Depending
- * on the architecture, one will be implemented as a wrapper around the
- * other (in drivers/pci/mmap.c).
- *
+/*
  * pci_mmap_resource_range() maps a specific BAR, and vm->vm_pgoff
  * is expected to be an offset within that region.
  *
- * pci_mmap_page_range() is the legacy architecture-specific interface,
- * which accepts a "user visible" resource address converted by
- * pci_resource_to_user(), as used in the legacy mmap() interface in
- * /proc/bus/pci/.
  */
 int pci_mmap_resource_range(struct pci_dev *dev, int bar,
 			    struct vm_area_struct *vma,
 			    enum pci_mmap_state mmap_state, int write_combine);
-int pci_mmap_page_range(struct pci_dev *pdev, int bar,
-			struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine);
 
 #ifndef arch_can_pci_mmap_wc
 #define arch_can_pci_mmap_wc()		0
-- 
cgit v1.2.3


From 909fcb195201f7c3aa81523cc182a4c9b52210e5 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Thu, 30 Jun 2022 00:53:21 +0200
Subject: clk: divider: Introduce devm_clk_hw_register_divider_parent_hw()

Add the devres variant of clk_hw_register_divider_parent_hw() for
registering a divider clock with clk_hw parent pointer instead of parent
name.

Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20220629225331.357308-2-marijn.suijten@somainline.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index c10dc4c659e2..4e07621849e6 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -831,6 +831,25 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name,
 	__devm_clk_hw_register_divider((dev), NULL, (name), (parent_name), NULL,   \
 				  NULL, (flags), (reg), (shift), (width),     \
 				  (clk_divider_flags), NULL, (lock))
+/**
+ * devm_clk_hw_register_divider_parent_hw - register a divider clock with the clock framework
+ * @dev: device registering this clock
+ * @name: name of this clock
+ * @parent_hw: pointer to parent clk
+ * @flags: framework-specific flags
+ * @reg: register address to adjust divider
+ * @shift: number of bits to shift the bitfield
+ * @width: width of the bitfield
+ * @clk_divider_flags: divider-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_divider_parent_hw(dev, name, parent_hw, flags,   \
+					       reg, shift, width,	      \
+					       clk_divider_flags, lock)       \
+	__devm_clk_hw_register_divider((dev), NULL, (name), NULL,	      \
+				       (parent_hw), NULL, (flags), (reg),     \
+				       (shift), (width), (clk_divider_flags), \
+				       NULL, (lock))
 /**
  * devm_clk_hw_register_divider_table - register a table based divider clock
  * with the clock framework (devres variant)
-- 
cgit v1.2.3


From df63af17f3375239e0be124844f304b4a4b60665 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Thu, 30 Jun 2022 00:53:22 +0200
Subject: clk: mux: Introduce devm_clk_hw_register_mux_parent_hws()

Add the devres variant of clk_hw_register_mux_hws() for registering a
mux clock with clk_hw parent pointers instead of parent names.

Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20220629225331.357308-3-marijn.suijten@somainline.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/clk-provider.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 4e07621849e6..316c7e082934 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -980,6 +980,13 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name,
 			      (parent_names), NULL, NULL, (flags), (reg),     \
 			      (shift), BIT((width)) - 1, (clk_mux_flags),     \
 			      NULL, (lock))
+#define devm_clk_hw_register_mux_parent_hws(dev, name, parent_hws,	      \
+					    num_parents, flags, reg, shift,   \
+					    width, clk_mux_flags, lock)       \
+	__devm_clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL,  \
+				   (parent_hws), NULL, (flags), (reg),        \
+				   (shift), BIT((width)) - 1,		      \
+				   (clk_mux_flags), NULL, (lock))
 
 int clk_mux_val_to_index(struct clk_hw *hw, const u32 *table, unsigned int flags,
 			 unsigned int val);
-- 
cgit v1.2.3


From 6ebd5247ad2aa210b3ff4481c6ed8aa32a032b12 Mon Sep 17 00:00:00 2001
From: Marijn Suijten <marijn.suijten@somainline.org>
Date: Thu, 30 Jun 2022 00:53:23 +0200
Subject: clk: fixed-factor: Introduce
 *clk_hw_register_fixed_factor_parent_hw()

Add the devres and non-devres variant of
clk_hw_register_fixed_factor_parent_hw() for registering a fixed factor
clock with clk_hw parent pointer instead of parent name.

Signed-off-by: Marijn Suijten <marijn.suijten@somainline.org>
Link: https://lore.kernel.org/r/20220629225331.357308-4-marijn.suijten@somainline.org
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk-fixed-factor.c | 45 +++++++++++++++++++++++++++++++++++++-----
 include/linux/clk-provider.h   |  8 ++++++++
 2 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c
index e6b36247c16b..f734e34735a9 100644
--- a/drivers/clk/clk-fixed-factor.c
+++ b/drivers/clk/clk-fixed-factor.c
@@ -78,7 +78,8 @@ static void devm_clk_hw_register_fixed_factor_release(struct device *dev, void *
 
 static struct clk_hw *
 __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
-		const char *name, const char *parent_name, int index,
+		const char *name, const char *parent_name,
+		const struct clk_hw *parent_hw, int index,
 		unsigned long flags, unsigned int mult, unsigned int div,
 		bool devm)
 {
@@ -110,6 +111,8 @@ __clk_hw_register_fixed_factor(struct device *dev, struct device_node *np,
 	init.flags = flags;
 	if (parent_name)
 		init.parent_names = &parent_name;
+	else if (parent_hw)
+		init.parent_hws = &parent_hw;
 	else
 		init.parent_data = &pdata;
 	init.num_parents = 1;
@@ -148,16 +151,48 @@ struct clk_hw *devm_clk_hw_register_fixed_factor_index(struct device *dev,
 		const char *name, unsigned int index, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
-	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, index,
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, NULL, index,
 					      flags, mult, div, true);
 }
 EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor_index);
 
+/**
+ * devm_clk_hw_register_fixed_factor_parent_hw - Register a fixed factor clock with
+ * pointer to parent clock
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_hw: pointer to parent clk
+ * @flags: fixed factor flags
+ * @mult: multiplier
+ * @div: divider
+ *
+ * Return: Pointer to fixed factor clk_hw structure that was registered or
+ * an error pointer.
+ */
+struct clk_hw *devm_clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div)
+{
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL, parent_hw,
+					      -1, flags, mult, div, true);
+}
+EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor_parent_hw);
+
+struct clk_hw *clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div)
+{
+	return __clk_hw_register_fixed_factor(dev, NULL, name, NULL,
+					      parent_hw, -1, flags, mult, div,
+					      false);
+}
+EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor_parent_hw);
+
 struct clk_hw *clk_hw_register_fixed_factor(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
-	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, -1,
+	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, NULL, -1,
 					      flags, mult, div, false);
 }
 EXPORT_SYMBOL_GPL(clk_hw_register_fixed_factor);
@@ -204,7 +239,7 @@ struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
 		const char *name, const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div)
 {
-	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, -1,
+	return __clk_hw_register_fixed_factor(dev, NULL, name, parent_name, NULL, -1,
 			flags, mult, div, true);
 }
 EXPORT_SYMBOL_GPL(devm_clk_hw_register_fixed_factor);
@@ -231,7 +266,7 @@ static struct clk_hw *_of_fixed_factor_clk_setup(struct device_node *node)
 
 	of_property_read_string(node, "clock-output-names", &clk_name);
 
-	hw = __clk_hw_register_fixed_factor(NULL, node, clk_name, NULL, 0,
+	hw = __clk_hw_register_fixed_factor(NULL, node, clk_name, NULL, NULL, 0,
 					    0, mult, div, false);
 	if (IS_ERR(hw)) {
 		/*
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 316c7e082934..94458cb669f0 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1032,6 +1032,14 @@ struct clk_hw *devm_clk_hw_register_fixed_factor(struct device *dev,
 struct clk_hw *devm_clk_hw_register_fixed_factor_index(struct device *dev,
 		const char *name, unsigned int index, unsigned long flags,
 		unsigned int mult, unsigned int div);
+
+struct clk_hw *devm_clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div);
+
+struct clk_hw *clk_hw_register_fixed_factor_parent_hw(struct device *dev,
+		const char *name, const struct clk_hw *parent_hw,
+		unsigned long flags, unsigned int mult, unsigned int div);
 /**
  * struct clk_fractional_divider - adjustable fractional divider clock
  *
-- 
cgit v1.2.3


From c770f31d8f580ed4b965c64f924ec1cc50e41734 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 19 Jul 2022 09:18:35 -0400
Subject: SUNRPC: Fix xdr_encode_bool()

I discovered that xdr_encode_bool() was returning the same address
that was passed in the @p parameter. The documenting comment states
that the intent is to return the address of the next buffer
location, just like the other "xdr_encode_*" helpers.

The result was the encoded results of NFSv3 PATHCONF operations were
not formed correctly.

Fixes: ded04a587f6c ("NFSD: Update the NFSv3 PATHCONF3res encoder to use struct xdr_stream")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/sunrpc/xdr.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5860f32e3958..986c8a17ca5e 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -419,8 +419,8 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
  */
 static inline __be32 *xdr_encode_bool(__be32 *p, u32 n)
 {
-	*p = n ? xdr_one : xdr_zero;
-	return p++;
+	*p++ = n ? xdr_one : xdr_zero;
+	return p;
 }
 
 /**
-- 
cgit v1.2.3


From 184cefbe62627730c30282df12bcff9aae4816ea Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Mon, 13 Jun 2022 09:40:06 -0400
Subject: NLM: Defend against file_lock changes after vfs_test_lock()

Instead of trusting that struct file_lock returns completely unchanged
after vfs_test_lock() when there's no conflicting lock, stash away our
nlm_lockowner reference so we can properly release it for all cases.

This defends against another file_lock implementation overwriting fl_owner
when the return type is F_UNLCK.

Reported-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
Tested-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc4proc.c         |  4 +++-
 fs/lockd/svclock.c          | 10 +---------
 fs/lockd/svcproc.c          |  5 ++++-
 include/linux/lockd/lockd.h |  1 +
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 176b468a61c7..4f247ab8be61 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -87,6 +87,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	struct nlm_args *argp = rqstp->rq_argp;
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	struct nlm_lockowner *test_owner;
 	__be32 rc = rpc_success;
 
 	dprintk("lockd: TEST4        called\n");
@@ -96,6 +97,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
+	test_owner = argp->lock.fl.fl_owner;
 	/* Now check for conflicting locks */
 	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
 	if (resp->status == nlm_drop_reply)
@@ -103,7 +105,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	else
 		dprintk("lockd: TEST4        status %d\n", ntohl(resp->status));
 
-	nlmsvc_release_lockowner(&argp->lock);
+	nlmsvc_put_lockowner(test_owner);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cb3658ab9b7a..9c1aa75441e1 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -340,7 +340,7 @@ nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
 	return lockowner;
 }
 
-static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
 {
 	if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
 		return;
@@ -590,7 +590,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	int			error;
 	int			mode;
 	__be32			ret;
-	struct nlm_lockowner	*test_owner;
 
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				nlmsvc_file_inode(file)->i_sb->s_id,
@@ -604,9 +603,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
-	/* If there's a conflicting lock, remember to clean up the test lock */
-	test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
-
 	mode = lock_to_openmode(&lock->fl);
 	error = vfs_test_lock(file->f_file[mode], &lock->fl);
 	if (error) {
@@ -635,10 +631,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	conflock->fl.fl_end = lock->fl.fl_end;
 	locks_release_private(&lock->fl);
 
-	/* Clean up the test lock */
-	lock->fl.fl_owner = NULL;
-	nlmsvc_put_lockowner(test_owner);
-
 	ret = nlm_lck_denied;
 out:
 	return ret;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 4dc1b40a489a..b09ca35b527c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -116,6 +116,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	struct nlm_args *argp = rqstp->rq_argp;
 	struct nlm_host	*host;
 	struct nlm_file	*file;
+	struct nlm_lockowner *test_owner;
 	__be32 rc = rpc_success;
 
 	dprintk("lockd: TEST          called\n");
@@ -125,6 +126,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
+	test_owner = argp->lock.fl.fl_owner;
+
 	/* Now check for conflicting locks */
 	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
 	if (resp->status == nlm_drop_reply)
@@ -133,7 +136,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 		dprintk("lockd: TEST          status %d vers %d\n",
 			ntohl(resp->status), rqstp->rq_vers);
 
-	nlmsvc_release_lockowner(&argp->lock);
+	nlmsvc_put_lockowner(test_owner);
 	nlmsvc_release_host(host);
 	nlm_release_file(file);
 	return rc;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index fcef192e5e45..70ce419e2709 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -292,6 +292,7 @@ void		  nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t);
 __be32		  nlm_lookup_file(struct svc_rqst *, struct nlm_file **,
 					struct nlm_lock *);
 void		  nlm_release_file(struct nlm_file *);
+void		  nlmsvc_put_lockowner(struct nlm_lockowner *);
 void		  nlmsvc_release_lockowner(struct nlm_lock *);
 void		  nlmsvc_mark_resources(struct net *);
 void		  nlmsvc_free_host_resources(struct nlm_host *);
-- 
cgit v1.2.3


From 5304877936c0a67e1a01464d113bae4c81eacdb6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 27 Jul 2022 14:40:03 -0400
Subject: NFSD: Fix strncpy() fortify warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In function ‘strncpy’,
    inlined from ‘nfsd4_ssc_setup_dul’ at /home/cel/src/linux/manet/fs/nfsd/nfs4proc.c:1392:3,
    inlined from ‘nfsd4_interssc_connect’ at /home/cel/src/linux/manet/fs/nfsd/nfs4proc.c:1489:11:
/home/cel/src/linux/manet/include/linux/fortify-string.h:52:33: warning: ‘__builtin_strncpy’ specified bound 63 equals destination size [-Wstringop-truncation]
   52 | #define __underlying_strncpy    __builtin_strncpy
      |                                 ^
/home/cel/src/linux/manet/include/linux/fortify-string.h:89:16: note: in expansion of macro ‘__underlying_strncpy’
   89 |         return __underlying_strncpy(p, q, size);
      |                ^~~~~~~~~~~~~~~~~~~~

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c      | 2 +-
 include/linux/nfs_ssc.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5af9f8d1feb6..660bd49f3a32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1389,7 +1389,7 @@ try_again:
 		return 0;
 	}
 	if (work) {
-		strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+		strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
 		refcount_set(&work->nsui_refcnt, 2);
 		work->nsui_busy = true;
 		list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
index 222ae8883e85..75843c00f326 100644
--- a/include/linux/nfs_ssc.h
+++ b/include/linux/nfs_ssc.h
@@ -64,7 +64,7 @@ struct nfsd4_ssc_umount_item {
 	refcount_t nsui_refcnt;
 	unsigned long nsui_expire;
 	struct vfsmount *nsui_vfsmount;
-	char nsui_ipaddr[RPC_MAX_ADDRBUFLEN];
+	char nsui_ipaddr[RPC_MAX_ADDRBUFLEN + 1];
 };
 #endif
 
-- 
cgit v1.2.3


From fef3e9066d19230f661048ca86937d954c12cd50 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Thu, 14 Jul 2022 16:41:47 +0800
Subject: writeback: remove inode_to_wb_is_valid()

inode_to_wb_is_valid() is no longer used since commit fe55d563d417
("remove inode_congested()"), remove it.

Link: https://lkml.kernel.org/r/20220714084147.140324-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e84b745a6811..439815cc1ab9 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return wb;
 }
 
-/**
- * inode_to_wb_is_valid - test whether an inode has a wb associated
- * @inode: inode of interest
- *
- * Returns %true if @inode has a wb associated.  May be called without any
- * locking.
- */
-static inline bool inode_to_wb_is_valid(struct inode *inode)
-{
-	return inode->i_wb;
-}
-
 /**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
@@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
-static inline bool inode_to_wb_is_valid(struct inode *inode)
-{
-	return true;
-}
-
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
-- 
cgit v1.2.3


From 73b73bac90d97400e29e585c678c4d0ebfd2680d Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Thu, 14 Jul 2022 06:49:18 +0000
Subject: mm: vmpressure: don't count proactive reclaim in vmpressure

memory.reclaim is a cgroup v2 interface that allows users to proactively
reclaim memory from a memcg, without real memory pressure.  Reclaim
operations invoke vmpressure, which is used: (a) To notify userspace of
reclaim efficiency in cgroup v1, and (b) As a signal for a memcg being
under memory pressure for networking (see
mem_cgroup_under_socket_pressure()).

For (a), vmpressure notifications in v1 are not affected by this change
since memory.reclaim is a v2 feature.

For (b), the effects of the vmpressure signal (according to Shakeel [1])
are as follows:
1. Reducing send and receive buffers of the current socket.
2. May drop packets on the rx path.
3. May throttle current thread on the tx path.

Since proactive reclaim is invoked directly by userspace, not by memory
pressure, it makes sense not to throttle networking.  Hence, this change
makes sure that proactive reclaim caused by memory.reclaim does not
trigger vmpressure.

[1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/

[yosryahmed@google.com: update documentation]
  Link: https://lkml.kernel.org/r/20220721173015.2643248-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: NeilBrown <neilb@suse.de>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst |  7 +++++++
 include/linux/swap.h                    |  5 ++++-
 mm/memcontrol.c                         | 24 ++++++++++++++----------
 mm/vmscan.c                             | 27 +++++++++++++++++----------
 4 files changed, 42 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index ad9ba3ec90a5..376d0207d1f7 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1229,6 +1229,13 @@ PAGE_SIZE multiple when read back.
 	the target cgroup. If less bytes are reclaimed than the
 	specified amount, -EAGAIN is returned.
 
+	Please note that the proactive reclaim (triggered by this
+	interface) is not meant to indicate memory pressure on the
+	memory cgroup. Therefore socket memory balancing triggered by
+	the memory reclaim normally is not exercised in this case.
+	This means that the networking layer will not adapt based on
+	reclaim induced by memory.reclaim.
+
   memory.peak
 	A read-only single value file which exists on non-root
 	cgroups.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6d11c51b2b62..ea895b40e6ff 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
+
+#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
+#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  bool may_swap);
+						  unsigned int reclaim_options);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 767f49a6b987..2b831cc48c7d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2330,7 +2330,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-							     gfp_mask, true);
+							gfp_mask,
+							MEMCG_RECLAIM_MAY_SWAP);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2575,7 +2576,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
 	bool passed_oom = false;
-	bool may_swap = true;
+	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
 	bool drained = false;
 	bool raised_max_event = false;
 	unsigned long pflags;
@@ -2593,7 +2594,7 @@ retry:
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		may_swap = false;
+		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
 	}
 
 	if (batch > nr_pages) {
@@ -2621,7 +2622,7 @@ retry:
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, may_swap);
+						    gfp_mask, reclaim_options);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3439,8 +3440,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 			continue;
 		}
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1,
-					GFP_KERNEL, !memsw)) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3550,7 +3551,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+						  MEMCG_RECLAIM_MAY_SWAP))
 			nr_retries--;
 	}
 
@@ -6302,7 +6304,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-							 GFP_KERNEL, true);
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -6351,7 +6353,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-							  GFP_KERNEL, true))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
 				nr_reclaims--;
 			continue;
 		}
@@ -6480,6 +6482,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int reclaim_options;
 	int err;
 
 	buf = strstrip(buf);
@@ -6487,6 +6490,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	if (err)
 		return err;
 
+	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
 		unsigned long reclaimed;
 
@@ -6503,7 +6507,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
+						GFP_KERNEL, reclaim_options);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fbb4108250ee..9e7d8db42918 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -101,6 +101,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Proactive reclaim invoked by userspace through memory.reclaim */
+	unsigned int proactive:1;
+
 	/*
 	 * Cgroup memory below memory.low is protected as long as we
 	 * don't threaten to OOM. If any cgroup is reclaimed at
@@ -3180,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			    sc->priority);
 
 		/* Record the group's reclaim efficiency */
-		vmpressure(sc->gfp_mask, memcg, false,
-			   sc->nr_scanned - scanned,
-			   sc->nr_reclaimed - reclaimed);
+		if (!sc->proactive)
+			vmpressure(sc->gfp_mask, memcg, false,
+				   sc->nr_scanned - scanned,
+				   sc->nr_reclaimed - reclaimed);
 
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
@@ -3305,9 +3309,10 @@ again:
 	}
 
 	/* Record the subtree's reclaim efficiency */
-	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-		   sc->nr_scanned - nr_scanned,
-		   sc->nr_reclaimed - nr_reclaimed);
+	if (!sc->proactive)
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+			   sc->nr_scanned - nr_scanned,
+			   sc->nr_reclaimed - nr_reclaimed);
 
 	if (sc->nr_reclaimed - nr_reclaimed)
 		reclaimable = true;
@@ -3589,8 +3594,9 @@ retry:
 		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
 	do {
-		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
-				sc->priority);
+		if (!sc->proactive)
+			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+					sc->priority);
 		sc->nr_scanned = 0;
 		shrink_zones(zonelist, sc);
 
@@ -3880,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool may_swap)
+					   unsigned int reclaim_options)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -3893,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = may_swap,
+		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
-- 
cgit v1.2.3


From e408e695f5f1f60d784913afc45ff2c387a5aeb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 Jul 2022 21:59:12 -0400
Subject: mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs

This allows userspace to set flags like FS_APPEND_FL, FS_IMMUTABLE_FL,
FS_NODUMP_FL, etc., like all other standard Linux file systems.

[akpm@linux-foundation.org: fix CONFIG_TMPFS_XATTR=n warnings]
Link: https://lkml.kernel.org/r/20220715015912.2560575-1-tytso@mit.edu
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 11 +++++++++
 mm/shmem.c               | 64 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index a68f982f22d1..1b6c4013f691 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -25,9 +25,20 @@ struct shmem_inode_info {
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct timespec64	i_crtime;	/* file creation time */
+	unsigned int		fsflags;	/* flags for FS_IOC_[SG]ETFLAGS */
 	struct inode		vfs_inode;
 };
 
+#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE
+#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE
+#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+
 struct shmem_sb_info {
 	unsigned long max_blocks;   /* How many blocks are allowed */
 	struct percpu_counter used_blocks;  /* How many are allocated */
diff --git a/mm/shmem.c b/mm/shmem.c
index 12ac67dc831f..06871a913b49 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
 #include <linux/ramfs.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
+#include <linux/fileattr.h>
 #include <linux/mm.h>
 #include <linux/random.h>
 #include <linux/sched/signal.h>
@@ -1058,6 +1059,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
 		shmem_recalc_inode(inode);
 		spin_unlock_irq(&info->lock);
 	}
+	if (info->fsflags & FS_APPEND_FL)
+		stat->attributes |= STATX_ATTR_APPEND;
+	if (info->fsflags & FS_IMMUTABLE_FL)
+		stat->attributes |= STATX_ATTR_IMMUTABLE;
+	if (info->fsflags & FS_NODUMP_FL)
+		stat->attributes |= STATX_ATTR_NODUMP;
+	stat->attributes_mask |= (STATX_ATTR_APPEND |
+			STATX_ATTR_IMMUTABLE |
+			STATX_ATTR_NODUMP);
 	generic_fillattr(&init_user_ns, inode, stat);
 
 	if (shmem_is_huge(NULL, inode, 0))
@@ -2272,7 +2282,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+/* Mask out flags that are inappropriate for the given type of inode. */
+static unsigned shmem_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & SHMEM_REG_FLMASK;
+	else
+		return flags & SHMEM_OTHER_FLMASK;
+}
+
+static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
 				     umode_t mode, dev_t dev, unsigned long flags)
 {
 	struct inode *inode;
@@ -2297,6 +2318,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
 		info->i_crtime = inode->i_mtime;
+		info->fsflags = (dir == NULL) ? 0 :
+			SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+		info->fsflags = shmem_mask_flags(mode, info->fsflags);
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
 		simple_xattrs_init(&info->xattrs);
@@ -3138,6 +3162,40 @@ static const char *shmem_get_link(struct dentry *dentry,
 }
 
 #ifdef CONFIG_TMPFS_XATTR
+
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+
+	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
+
+	return 0;
+}
+
+static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+			      struct dentry *dentry, struct fileattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (fileattr_has_fsx(fa))
+		return -EOPNOTSUPP;
+
+	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
+
+	inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME);
+	if (info->fsflags & FS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (info->fsflags & FS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (info->fsflags & FS_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+
+	inode->i_ctime = current_time(inode);
+	return 0;
+}
+
 /*
  * Superblocks without xattr inode operations may get some security.* xattr
  * support from the LSM "for free". As soon as we have any other xattrs
@@ -3828,6 +3886,8 @@ static const struct inode_operations shmem_inode_operations = {
 #ifdef CONFIG_TMPFS_XATTR
 	.listxattr	= shmem_listxattr,
 	.set_acl	= simple_set_acl,
+	.fileattr_get	= shmem_fileattr_get,
+	.fileattr_set	= shmem_fileattr_set,
 #endif
 };
 
@@ -3847,6 +3907,8 @@ static const struct inode_operations shmem_dir_inode_operations = {
 #endif
 #ifdef CONFIG_TMPFS_XATTR
 	.listxattr	= shmem_listxattr,
+	.fileattr_get	= shmem_fileattr_get,
+	.fileattr_set	= shmem_fileattr_set,
 #endif
 #ifdef CONFIG_TMPFS_POSIX_ACL
 	.setattr	= shmem_setattr,
-- 
cgit v1.2.3


From bb077c3ffd5362a6d9e60574e1bcc83fe8e3fb27 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 26 Jul 2022 21:18:16 +0800
Subject: mm: cleanup is_highmem()

It is unnecessary to add CONFIG_HIGHMEM check in is_highmem(), which has
been done in is_highmem_idx(), and move is_highmem() close to
is_highmem_idx().  This has no functional impact.

Link: https://lkml.kernel.org/r/20220726131816.149075-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 578247a341b2..e24b40c52468 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,15 +1137,6 @@ static inline int is_highmem_idx(enum zone_type idx)
 #endif
 }
 
-#ifdef CONFIG_ZONE_DMA
-bool has_managed_dma(void);
-#else
-static inline bool has_managed_dma(void)
-{
-	return false;
-}
-#endif
-
 /**
  * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
@@ -1155,12 +1146,17 @@ static inline bool has_managed_dma(void)
  */
 static inline int is_highmem(struct zone *zone)
 {
-#ifdef CONFIG_HIGHMEM
 	return is_highmem_idx(zone_idx(zone));
+}
+
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void);
 #else
-	return 0;
-#endif
+static inline bool has_managed_dma(void)
+{
+	return false;
 }
+#endif
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-- 
cgit v1.2.3


From fa7d574ba4f4f3f4f78d432c8545d9045daa89b1 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Tue, 19 Jul 2022 16:33:49 +0800
Subject: bdi: remove enum wb_congested_state

enum wb_congested_state and the member 'congested' in bdi_writeback are
useless since commit a88f2096d5a2 ("remove congestion tracking
framework"), so remove it.

Link: https://lkml.kernel.org/r/20220719083349.87547-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/backing-dev-defs.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e863c88df95f..ae12696ec492 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -28,11 +28,6 @@ enum wb_state {
 	WB_start_all,		/* nr_pages == 0 (all) work pending */
 };
 
-enum wb_congested_state {
-	WB_async_congested,	/* The async (write) queue is getting full */
-	WB_sync_congested,	/* The sync queue is getting full */
-};
-
 enum wb_stat_item {
 	WB_RECLAIMABLE,
 	WB_WRITEBACK,
@@ -122,8 +117,6 @@ struct bdi_writeback {
 	atomic_t writeback_inodes;	/* number of inodes under writeback */
 	struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
-	unsigned long congested;	/* WB_[a]sync_congested flags */
-
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
 	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
-- 
cgit v1.2.3


From 102227b970a15256f5ffd12a6a276ddf978e6caf Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:40 +0200
Subject: rv: Add Runtime Verification (RV) interface

RV is a lightweight (yet rigorous) method that complements classical
exhaustive verification techniques (such as model checking and
theorem proving) with a more practical approach to complex systems.

RV works by analyzing the trace of the system's actual execution,
comparing it against a formal specification of the system behavior.
RV can give precise information on the runtime behavior of the
monitored system while enabling the reaction for unexpected
events, avoiding, for example, the propagation of a failure on
safety-critical systems.

The development of this interface roots in the development of the
paper:

De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
Silva. Efficient formal verification for the Linux kernel. In:
International Conference on Software Engineering and Formal Methods.
Springer, Cham, 2019. p. 315-332.

And:

De Oliveira, Daniel Bristot. Automata-based formal analysis
and verification of the real-time Linux kernel. PhD Thesis, 2020.

The RV interface resembles the tracing/ interface on purpose. The current
path for the RV interface is /sys/kernel/tracing/rv/.

It presents these files:

 "available_monitors"
   - List the available monitors, one per line.

   For example:
     # cat available_monitors
     wip
     wwnr

 "enabled_monitors"
   - Lists the enabled monitors, one per line;
   - Writing to it enables a given monitor;
   - Writing a monitor name with a '!' prefix disables it;
   - Truncating the file disables all enabled monitors.

   For example:
     # cat enabled_monitors
     # echo wip > enabled_monitors
     # echo wwnr >> enabled_monitors
     # cat enabled_monitors
     wip
     wwnr
     # echo '!wip' >> enabled_monitors
     # cat enabled_monitors
     wwnr
     # echo > enabled_monitors
     # cat enabled_monitors
     #

   Note that more than one monitor can be enabled concurrently.

 "monitoring_on"
   - It is an on/off general switcher for monitoring. Note
   that it does not disable enabled monitors or detach events,
   but stop the per-entity monitors of monitoring the events
   received from the system. It resembles the "tracing_on" switcher.

 "monitors/"
   Each monitor will have its one directory inside "monitors/". There
   the monitor specific files will be presented.
   The "monitors/" directory resembles the "events" directory on
   tracefs.

   For example:
     # cd monitors/wip/
     # ls
     desc  enable
     # cat desc
     wakeup in preemptive per-cpu testing monitor.
     # cat enable
     0

For further information, see the comments in the header of
kernel/trace/rv/rv.c from this patch.

Link: https://lkml.kernel.org/r/a4bfe038f50cb047bfb343ad0e12b0e646ab308b.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h       |  43 +++
 include/linux/sched.h    |  11 +
 kernel/trace/Kconfig     |   2 +
 kernel/trace/Makefile    |   1 +
 kernel/trace/rv/Kconfig  |  12 +
 kernel/trace/rv/Makefile |   3 +
 kernel/trace/rv/rv.c     | 782 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/rv/rv.h     |  33 ++
 kernel/trace/trace.c     |   2 +
 kernel/trace/trace.h     |   9 +
 10 files changed, 898 insertions(+)
 create mode 100644 include/linux/rv.h
 create mode 100644 kernel/trace/rv/Kconfig
 create mode 100644 kernel/trace/rv/Makefile
 create mode 100644 kernel/trace/rv/rv.c
 create mode 100644 kernel/trace/rv/rv.h

(limited to 'include/linux')

diff --git a/include/linux/rv.h b/include/linux/rv.h
new file mode 100644
index 000000000000..d8fa9e8be94a
--- /dev/null
+++ b/include/linux/rv.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Runtime Verification.
+ *
+ * For futher information, see: kernel/trace/rv/rv.c.
+ */
+#ifndef _LINUX_RV_H
+#define _LINUX_RV_H
+
+#ifdef CONFIG_RV
+
+/*
+ * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS.
+ * If we find justification for more monitors, we can think about
+ * adding more or developing a dynamic method. So far, none of
+ * these are justified.
+ */
+#define RV_PER_TASK_MONITORS		1
+#define RV_PER_TASK_MONITOR_INIT	(RV_PER_TASK_MONITORS)
+
+/*
+ * Futher monitor types are expected, so make this a union.
+ */
+union rv_task_monitor {
+};
+
+struct rv_monitor {
+	const char		*name;
+	const char		*description;
+	bool			enabled;
+	int			(*enable)(void);
+	void			(*disable)(void);
+	void			(*reset)(void);
+};
+
+bool rv_monitoring_on(void);
+int rv_unregister_monitor(struct rv_monitor *monitor);
+int rv_register_monitor(struct rv_monitor *monitor);
+int rv_get_task_monitor_slot(void);
+void rv_put_task_monitor_slot(int slot);
+
+#endif /* CONFIG_RV */
+#endif /* _LINUX_RV_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..b5da3e7c3a04 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/rseq.h>
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
+#include <linux/rv.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -1500,6 +1501,16 @@ struct task_struct {
 	struct callback_head		l1d_flush_kill;
 #endif
 
+#ifdef CONFIG_RV
+	/*
+	 * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
+	 * If we find justification for more monitors, we can think
+	 * about adding more or developing a dynamic method. So far,
+	 * none of these are justified.
+	 */
+	union rv_task_monitor		rv[RV_PER_TASK_MONITORS];
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ccd6a5ade3e9..1052126bdca2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1106,4 +1106,6 @@ config HIST_TRIGGERS_DEBUG
 
           If unsure, say N.
 
+source "kernel/trace/rv/Kconfig"
+
 endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0d261774d6f3..c6651e16b557 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -106,5 +106,6 @@ obj-$(CONFIG_FPROBE) += fprobe.o
 obj-$(CONFIG_RETHOOK) += rethook.o
 
 obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+obj-$(CONFIG_RV) += rv/
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
new file mode 100644
index 000000000000..6d127cdb00dd
--- /dev/null
+++ b/kernel/trace/rv/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+menuconfig RV
+	bool "Runtime Verification"
+	depends on TRACING
+	help
+	  Enable the kernel runtime verification infrastructure. RV is a
+	  lightweight (yet rigorous) method that complements classical
+	  exhaustive verification techniques (such as model checking and
+	  theorem proving). RV works by analyzing the trace of the system's
+	  actual execution, comparing it against a formal specification of
+	  the system behavior.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
new file mode 100644
index 000000000000..fd995379df67
--- /dev/null
+++ b/kernel/trace/rv/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_RV) += rv.o
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
new file mode 100644
index 000000000000..731cc961cc70
--- /dev/null
+++ b/kernel/trace/rv/rv.c
@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * This is the online Runtime Verification (RV) interface.
+ *
+ * RV is a lightweight (yet rigorous) method that complements classical
+ * exhaustive verification techniques (such as model checking and
+ * theorem proving) with a more practical approach to complex systems.
+ *
+ * RV works by analyzing the trace of the system's actual execution,
+ * comparing it against a formal specification of the system behavior.
+ * RV can give precise information on the runtime behavior of the
+ * monitored system while enabling the reaction for unexpected
+ * events, avoiding, for example, the propagation of a failure on
+ * safety-critical systems.
+ *
+ * The development of this interface roots in the development of the
+ * paper:
+ *
+ * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo
+ * Silva. Efficient formal verification for the Linux kernel. In:
+ * International Conference on Software Engineering and Formal Methods.
+ * Springer, Cham, 2019. p. 315-332.
+ *
+ * And:
+ *
+ * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis
+ * and verification of the real-time Linux kernel. PhD Thesis, 2020.
+ *
+ * == Runtime monitor interface ==
+ *
+ * A monitor is the central part of the runtime verification of a system.
+ *
+ * The monitor stands in between the formal specification of the desired
+ * (or undesired) behavior, and the trace of the actual system.
+ *
+ * In Linux terms, the runtime verification monitors are encapsulated
+ * inside the "RV monitor" abstraction. A RV monitor includes a reference
+ * model of the system, a set of instances of the monitor (per-cpu monitor,
+ * per-task monitor, and so on), and the helper functions that glue the
+ * monitor to the system via trace. Generally, a monitor includes some form
+ * of trace output as a reaction for event parsing and exceptions,
+ * as depicted bellow:
+ *
+ * Linux  +----- RV Monitor ----------------------------------+ Formal
+ *  Realm |                                                   |  Realm
+ *  +-------------------+     +----------------+     +-----------------+
+ *  |   Linux kernel    |     |     Monitor    |     |     Reference   |
+ *  |     Tracing       |  -> |   Instance(s)  | <-  |       Model     |
+ *  | (instrumentation) |     | (verification) |     | (specification) |
+ *  +-------------------+     +----------------+     +-----------------+
+ *         |                          |                       |
+ *         |                          V                       |
+ *         |                     +----------+                 |
+ *         |                     | Reaction |                 |
+ *         |                     +--+--+--+-+                 |
+ *         |                        |  |  |                   |
+ *         |                        |  |  +-> trace output ?  |
+ *         +------------------------|--|----------------------+
+ *                                  |  +----> panic ?
+ *                                  +-------> <user-specified>
+ *
+ * This file implements the interface for loading RV monitors, and
+ * to control the verification session.
+ *
+ * == Registering monitors ==
+ *
+ * The struct rv_monitor defines a set of callback functions to control
+ * a verification session. For instance, when a given monitor is enabled,
+ * the "enable" callback function is called to hook the instrumentation
+ * functions to the kernel trace events. The "disable" function is called
+ * when disabling the verification session.
+ *
+ * A RV monitor is registered via:
+ *   int rv_register_monitor(struct rv_monitor *monitor);
+ * And unregistered via:
+ *   int rv_unregister_monitor(struct rv_monitor *monitor);
+ *
+ * == User interface ==
+ *
+ * The user interface resembles kernel tracing interface. It presents
+ * these files:
+ *
+ *  "available_monitors"
+ *    - List the available monitors, one per line.
+ *
+ *    For example:
+ *      # cat available_monitors
+ *      wip
+ *      wwnr
+ *
+ *  "enabled_monitors"
+ *    - Lists the enabled monitors, one per line;
+ *    - Writing to it enables a given monitor;
+ *    - Writing a monitor name with a '!' prefix disables it;
+ *    - Truncating the file disables all enabled monitors.
+ *
+ *    For example:
+ *      # cat enabled_monitors
+ *      # echo wip > enabled_monitors
+ *      # echo wwnr >> enabled_monitors
+ *      # cat enabled_monitors
+ *      wip
+ *      wwnr
+ *      # echo '!wip' >> enabled_monitors
+ *      # cat enabled_monitors
+ *      wwnr
+ *      # echo > enabled_monitors
+ *      # cat enabled_monitors
+ *      #
+ *
+ *    Note that more than one monitor can be enabled concurrently.
+ *
+ *  "monitoring_on"
+ *    - It is an on/off general switcher for monitoring. Note
+ *    that it does not disable enabled monitors or detach events,
+ *    but stops the per-entity monitors from monitoring the events
+ *    received from the instrumentation. It resembles the "tracing_on"
+ *    switcher.
+ *
+ *  "monitors/"
+ *    Each monitor will have its own directory inside "monitors/". There
+ *    the monitor specific files will be presented.
+ *    The "monitors/" directory resembles the "events" directory on
+ *    tracefs.
+ *
+ *    For example:
+ *      # cd monitors/wip/
+ *      # ls
+ *      desc  enable
+ *      # cat desc
+ *      auto-generated wakeup in preemptive monitor.
+ *      # cat enable
+ *      0
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include "rv.h"
+
+DEFINE_MUTEX(rv_interface_lock);
+
+static struct rv_interface rv_root;
+
+struct dentry *get_monitors_root(void)
+{
+	return rv_root.monitors_dir;
+}
+
+/*
+ * Interface for the monitor register.
+ */
+static LIST_HEAD(rv_monitors_list);
+
+static int task_monitor_count;
+static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+
+int rv_get_task_monitor_slot(void)
+{
+	int i;
+
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (task_monitor_count == RV_PER_TASK_MONITORS)
+		return -EBUSY;
+
+	task_monitor_count++;
+
+	for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+		if (task_monitor_slots[i] == false) {
+			task_monitor_slots[i] = true;
+			return i;
+		}
+	}
+
+	WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n");
+
+	return -EINVAL;
+}
+
+void rv_put_task_monitor_slot(int slot)
+{
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+		WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
+		return;
+	}
+
+	WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n",
+		  slot);
+
+	task_monitor_count--;
+	task_monitor_slots[slot] = false;
+}
+
+/*
+ * This section collects the monitor/ files and folders.
+ */
+static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
+					loff_t *ppos)
+{
+	struct rv_monitor_def *mdef = filp->private_data;
+	const char *buff;
+
+	buff = mdef->monitor->enabled ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+/*
+ * __rv_disable_monitor - disabled an enabled monitor
+ */
+static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+{
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (mdef->monitor->enabled) {
+		mdef->monitor->enabled = 0;
+		mdef->monitor->disable();
+
+		/*
+		 * Wait for the execution of all events to finish.
+		 * Otherwise, the data used by the monitor could
+		 * be inconsistent. i.e., if the monitor is re-enabled.
+		 */
+		if (sync)
+			tracepoint_synchronize_unregister();
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * rv_disable_monitor - disable a given runtime monitor
+ *
+ * Returns 0 on success.
+ */
+int rv_disable_monitor(struct rv_monitor_def *mdef)
+{
+	__rv_disable_monitor(mdef, true);
+	return 0;
+}
+
+/**
+ * rv_enable_monitor - enable a given runtime monitor
+ *
+ * Returns 0 on success, error otherwise.
+ */
+int rv_enable_monitor(struct rv_monitor_def *mdef)
+{
+	int retval;
+
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (mdef->monitor->enabled)
+		return 0;
+
+	retval = mdef->monitor->enable();
+
+	if (!retval)
+		mdef->monitor->enabled = 1;
+
+	return retval;
+}
+
+/*
+ * interface for enabling/disabling a monitor.
+ */
+static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
+					 size_t count, loff_t *ppos)
+{
+	struct rv_monitor_def *mdef = filp->private_data;
+	int retval;
+	bool val;
+
+	retval = kstrtobool_from_user(user_buf, count, &val);
+	if (retval)
+		return retval;
+
+	retval = count;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (val)
+		retval = rv_enable_monitor(mdef);
+	else
+		retval = rv_disable_monitor(mdef);
+
+	mutex_unlock(&rv_interface_lock);
+
+	return retval ? : count;
+}
+
+static const struct file_operations interface_enable_fops = {
+	.open   = simple_open,
+	.llseek = no_llseek,
+	.write  = monitor_enable_write_data,
+	.read   = monitor_enable_read_data,
+};
+
+/*
+ * Interface to read monitors description.
+ */
+static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
+				      loff_t *ppos)
+{
+	struct rv_monitor_def *mdef = filp->private_data;
+	char buff[256];
+
+	memset(buff, 0, sizeof(buff));
+
+	snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static const struct file_operations interface_desc_fops = {
+	.open   = simple_open,
+	.llseek	= no_llseek,
+	.read	= monitor_desc_read_data,
+};
+
+/*
+ * During the registration of a monitor, this function creates
+ * the monitor dir, where the specific options of the monitor
+ * are exposed.
+ */
+static int create_monitor_dir(struct rv_monitor_def *mdef)
+{
+	struct dentry *root = get_monitors_root();
+	const char *name = mdef->monitor->name;
+	struct dentry *tmp;
+	int retval;
+
+	mdef->root_d = rv_create_dir(name, root);
+	if (!mdef->root_d)
+		return -ENOMEM;
+
+	tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+	if (!tmp) {
+		retval = -ENOMEM;
+		goto out_remove_root;
+	}
+
+	tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+	if (!tmp) {
+		retval = -ENOMEM;
+		goto out_remove_root;
+	}
+
+	return 0;
+
+out_remove_root:
+	rv_remove(mdef->root_d);
+	return retval;
+}
+
+/*
+ * Available/Enable monitor shared seq functions.
+ */
+static int monitors_show(struct seq_file *m, void *p)
+{
+	struct rv_monitor_def *mon_def = p;
+
+	seq_printf(m, "%s\n", mon_def->monitor->name);
+	return 0;
+}
+
+/*
+ * Used by the seq file operations at the end of a read
+ * operation.
+ */
+static void monitors_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&rv_interface_lock);
+}
+
+/*
+ * Available monitor seq functions.
+ */
+static void *available_monitors_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&rv_interface_lock);
+	return seq_list_start(&rv_monitors_list, *pos);
+}
+
+static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &rv_monitors_list, pos);
+}
+
+/*
+ * Enable monitor seq functions.
+ */
+static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct rv_monitor_def *m_def = p;
+
+	(*pos)++;
+
+	list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
+		if (m_def->monitor->enabled)
+			return m_def;
+	}
+
+	return NULL;
+}
+
+static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
+{
+	struct rv_monitor_def *m_def;
+	loff_t l;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (list_empty(&rv_monitors_list))
+		return NULL;
+
+	m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+
+	for (l = 0; l <= *pos; ) {
+		m_def = enabled_monitors_next(m, m_def, &l);
+		if (!m_def)
+			break;
+	}
+
+	return m_def;
+}
+
+/*
+ * available/enabled monitors seq definition.
+ */
+static const struct seq_operations available_monitors_seq_ops = {
+	.start	= available_monitors_start,
+	.next	= available_monitors_next,
+	.stop	= monitors_stop,
+	.show	= monitors_show
+};
+
+static const struct seq_operations enabled_monitors_seq_ops = {
+	.start  = enabled_monitors_start,
+	.next   = enabled_monitors_next,
+	.stop   = monitors_stop,
+	.show   = monitors_show
+};
+
+/*
+ * available_monitors interface.
+ */
+static int available_monitors_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &available_monitors_seq_ops);
+};
+
+static const struct file_operations available_monitors_ops = {
+	.open    = available_monitors_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * enabled_monitors interface.
+ */
+static void disable_all_monitors(void)
+{
+	struct rv_monitor_def *mdef;
+	int enabled = 0;
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry(mdef, &rv_monitors_list, list)
+		enabled += __rv_disable_monitor(mdef, false);
+
+	if (enabled) {
+		/*
+		 * Wait for the execution of all events to finish.
+		 * Otherwise, the data used by the monitor could
+		 * be inconsistent. i.e., if the monitor is re-enabled.
+		 */
+		tracepoint_synchronize_unregister();
+	}
+
+	mutex_unlock(&rv_interface_lock);
+}
+
+static int enabled_monitors_open(struct inode *inode, struct file *file)
+{
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
+		disable_all_monitors();
+
+	return seq_open(file, &enabled_monitors_seq_ops);
+};
+
+static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
+	struct rv_monitor_def *mdef;
+	int retval = -EINVAL;
+	bool enable = true;
+	char *ptr = buff;
+	int len;
+
+	if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
+		return -EINVAL;
+
+	memset(buff, 0, sizeof(buff));
+
+	retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+	if (retval < 0)
+		return -EFAULT;
+
+	ptr = strim(buff);
+
+	if (ptr[0] == '!') {
+		enable = false;
+		ptr++;
+	}
+
+	len = strlen(ptr);
+	if (!len)
+		return count;
+
+	mutex_lock(&rv_interface_lock);
+
+	retval = -EINVAL;
+
+	list_for_each_entry(mdef, &rv_monitors_list, list) {
+		if (strcmp(ptr, mdef->monitor->name) != 0)
+			continue;
+
+		/*
+		 * Monitor found!
+		 */
+		if (enable)
+			retval = rv_enable_monitor(mdef);
+		else
+			retval = rv_disable_monitor(mdef);
+
+		if (!retval)
+			retval = count;
+
+		break;
+	}
+
+	mutex_unlock(&rv_interface_lock);
+	return retval;
+}
+
+static const struct file_operations enabled_monitors_ops = {
+	.open		= enabled_monitors_open,
+	.read		= seq_read,
+	.write		= enabled_monitors_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Monitoring on global switcher!
+ */
+static bool __read_mostly monitoring_on;
+
+/**
+ * rv_monitoring_on - checks if monitoring is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_monitoring_on(void)
+{
+	/* Ensures that concurrent monitors read consistent monitoring_on */
+	smp_rmb();
+	return READ_ONCE(monitoring_on);
+}
+
+/*
+ * monitoring_on general switcher.
+ */
+static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
+				       size_t count, loff_t *ppos)
+{
+	const char *buff;
+
+	buff = rv_monitoring_on() ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
+}
+
+static void turn_monitoring_off(void)
+{
+	WRITE_ONCE(monitoring_on, false);
+	/* Ensures that concurrent monitors read consistent monitoring_on */
+	smp_wmb();
+}
+
+static void reset_all_monitors(void)
+{
+	struct rv_monitor_def *mdef;
+
+	list_for_each_entry(mdef, &rv_monitors_list, list) {
+		if (mdef->monitor->enabled)
+			mdef->monitor->reset();
+	}
+}
+
+static void turn_monitoring_on(void)
+{
+	WRITE_ONCE(monitoring_on, true);
+	/* Ensures that concurrent monitors read consistent monitoring_on */
+	smp_wmb();
+}
+
+static void turn_monitoring_on_with_reset(void)
+{
+	lockdep_assert_held(&rv_interface_lock);
+
+	if (rv_monitoring_on())
+		return;
+
+	/*
+	 * Monitors might be out of sync with the system if events were not
+	 * processed because of !rv_monitoring_on().
+	 *
+	 * Reset all monitors, forcing a re-sync.
+	 */
+	reset_all_monitors();
+	turn_monitoring_on();
+}
+
+static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf,
+					size_t count, loff_t *ppos)
+{
+	int retval;
+	bool val;
+
+	retval = kstrtobool_from_user(user_buf, count, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (val)
+		turn_monitoring_on_with_reset();
+	else
+		turn_monitoring_off();
+
+	/*
+	 * Wait for the execution of all events to finish
+	 * before returning to user-space.
+	 */
+	tracepoint_synchronize_unregister();
+
+	mutex_unlock(&rv_interface_lock);
+
+	return count;
+}
+
+static const struct file_operations monitoring_on_fops = {
+	.open   = simple_open,
+	.llseek = no_llseek,
+	.write  = monitoring_on_write_data,
+	.read   = monitoring_on_read_data,
+};
+
+static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+{
+	rv_remove(mdef->root_d);
+}
+
+/**
+ * rv_register_monitor - register a rv monitor.
+ * @monitor:    The rv_monitor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_monitor(struct rv_monitor *monitor)
+{
+	struct rv_monitor_def *r;
+	int retval = 0;
+
+	if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
+		pr_info("Monitor %s has a name longer than %d\n", monitor->name,
+			MAX_RV_MONITOR_NAME_SIZE);
+		return -1;
+	}
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry(r, &rv_monitors_list, list) {
+		if (strcmp(monitor->name, r->monitor->name) == 0) {
+			pr_info("Monitor %s is already registered\n", monitor->name);
+			retval = -1;
+			goto out_unlock;
+		}
+	}
+
+	r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
+	if (!r) {
+		retval = -ENOMEM;
+		goto out_unlock;
+	}
+
+	r->monitor = monitor;
+
+	retval = create_monitor_dir(r);
+	if (retval) {
+		kfree(r);
+		goto out_unlock;
+	}
+
+	list_add_tail(&r->list, &rv_monitors_list);
+
+out_unlock:
+	mutex_unlock(&rv_interface_lock);
+	return retval;
+}
+
+/**
+ * rv_unregister_monitor - unregister a rv monitor.
+ * @monitor:    The rv_monitor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_monitor(struct rv_monitor *monitor)
+{
+	struct rv_monitor_def *ptr, *next;
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
+		if (strcmp(monitor->name, ptr->monitor->name) == 0) {
+			rv_disable_monitor(ptr);
+			list_del(&ptr->list);
+			destroy_monitor_dir(ptr);
+		}
+	}
+
+	mutex_unlock(&rv_interface_lock);
+	return 0;
+}
+
+int __init rv_init_interface(void)
+{
+	struct dentry *tmp;
+
+	rv_root.root_dir = rv_create_dir("rv", NULL);
+	if (!rv_root.root_dir)
+		goto out_err;
+
+	rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir);
+	if (!rv_root.monitors_dir)
+		goto out_err;
+
+	tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL,
+			     &available_monitors_ops);
+	if (!tmp)
+		goto out_err;
+
+	tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL,
+			     &enabled_monitors_ops);
+	if (!tmp)
+		goto out_err;
+
+	tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL,
+			     &monitoring_on_fops);
+	if (!tmp)
+		goto out_err;
+
+	turn_monitoring_on();
+
+	return 0;
+
+out_err:
+	rv_remove(rv_root.root_dir);
+	printk(KERN_ERR "RV: Error while creating the RV interface\n");
+	return 1;
+}
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
new file mode 100644
index 000000000000..50014aa224a7
--- /dev/null
+++ b/kernel/trace/rv/rv.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/mutex.h>
+
+struct rv_interface {
+	struct dentry		*root_dir;
+	struct dentry		*monitors_dir;
+};
+
+#include "../trace.h"
+#include <linux/tracefs.h>
+#include <linux/rv.h>
+
+#define RV_MODE_WRITE			TRACE_MODE_WRITE
+#define RV_MODE_READ			TRACE_MODE_READ
+
+#define rv_create_dir			tracefs_create_dir
+#define rv_create_file			tracefs_create_file
+#define rv_remove			tracefs_remove
+
+#define MAX_RV_MONITOR_NAME_SIZE	32
+
+extern struct mutex rv_interface_lock;
+
+struct rv_monitor_def {
+	struct list_head	list;
+	struct rv_monitor	*monitor;
+	struct dentry		*root_d;
+	bool			task_monitor;
+};
+
+struct dentry *get_monitors_root(void);
+int rv_disable_monitor(struct rv_monitor_def *mdef);
+int rv_enable_monitor(struct rv_monitor_def *mdef);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7eb5bce62500..301305ec134b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9772,6 +9772,8 @@ static __init int tracer_init_tracefs(void)
 		tracer_init_tracefs_work_func(NULL);
 	}
 
+	rv_init_interface();
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ff816fb41e48..900e75d96c84 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2005,4 +2005,13 @@ struct trace_min_max_param {
 
 extern const struct file_operations trace_min_max_fops;
 
+#ifdef CONFIG_RV
+extern int rv_init_interface(void);
+#else
+static inline int rv_init_interface(void)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_KERNEL_TRACE_H */
-- 
cgit v1.2.3


From 04acadcb4453cf8011dd3d4ce8d97fecac42d325 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:41 +0200
Subject: rv: Add runtime reactors interface

A runtime monitor can cause a reaction to the detection of an
exception on the model's execution. By default, the monitors have
tracing reactions, printing the monitor output via tracepoints.
But other reactions can be added (on-demand) via this interface.

The user interface resembles the kernel tracing interface and
presents these files:

"available_reactors"
  - Reading shows the available reactors, one per line.

   For example:
     # cat available_reactors
     nop
     panic
     printk

 "reacting_on"
   - It is an on/off general switch for reactors, disabling
   all reactions.

 "monitors/MONITOR/reactors"
   - List available reactors, with the select reaction for the given
   MONITOR inside []. The default one is the nop (no operation)
   reactor.
   - Writing the name of a reactor enables it to the given
   MONITOR.

   For example:
     # cat monitors/wip/reactors
     [nop]
     panic
     printk
     # echo panic > monitors/wip/reactors
     # cat monitors/wip/reactors
     nop
     [panic]
     printk

Link: https://lkml.kernel.org/r/1794eb994637457bdeaa6bad0b8263d2f7eece0c.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h            |  17 ++
 kernel/trace/rv/Kconfig       |  11 +
 kernel/trace/rv/Makefile      |   1 +
 kernel/trace/rv/rv.c          |   9 +
 kernel/trace/rv/rv.h          |  35 +++
 kernel/trace/rv/rv_reactors.c | 508 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 581 insertions(+)
 create mode 100644 kernel/trace/rv/rv_reactors.c

(limited to 'include/linux')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index d8fa9e8be94a..dcce56987cf7 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -24,6 +24,14 @@
 union rv_task_monitor {
 };
 
+#ifdef CONFIG_RV_REACTORS
+struct rv_reactor {
+	const char		*name;
+	const char		*description;
+	void			(*react)(char *msg);
+};
+#endif
+
 struct rv_monitor {
 	const char		*name;
 	const char		*description;
@@ -31,6 +39,9 @@ struct rv_monitor {
 	int			(*enable)(void);
 	void			(*disable)(void);
 	void			(*reset)(void);
+#ifdef CONFIG_RV_REACTORS
+	void			(*react)(char *msg);
+#endif
 };
 
 bool rv_monitoring_on(void);
@@ -39,5 +50,11 @@ int rv_register_monitor(struct rv_monitor *monitor);
 int rv_get_task_monitor_slot(void);
 void rv_put_task_monitor_slot(int slot);
 
+#ifdef CONFIG_RV_REACTORS
+bool rv_reacting_on(void);
+int rv_unregister_reactor(struct rv_reactor *reactor);
+int rv_register_reactor(struct rv_reactor *reactor);
+#endif /* CONFIG_RV_REACTORS */
+
 #endif /* CONFIG_RV */
 #endif /* _LINUX_RV_H */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 6d127cdb00dd..3eb5d48ab4f6 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -10,3 +10,14 @@ menuconfig RV
 	  theorem proving). RV works by analyzing the trace of the system's
 	  actual execution, comparing it against a formal specification of
 	  the system behavior.
+
+config RV_REACTORS
+	bool "Runtime verification reactors"
+	default y
+	depends on RV
+	help
+	  Enables the online runtime verification reactors. A runtime
+	  monitor can cause a reaction to the detection of an exception
+	  on the model's execution. By default, the monitors have
+	  tracing reactions, printing the monitor output via tracepoints,
+	  but other reactions can be added (on-demand) via this interface.
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index fd995379df67..8944274d9b41 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_RV) += rv.o
+obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 731cc961cc70..45cf64eb2600 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -353,6 +353,10 @@ static int create_monitor_dir(struct rv_monitor_def *mdef)
 		goto out_remove_root;
 	}
 
+	retval = reactor_populate_monitor(mdef);
+	if (retval)
+		goto out_remove_root;
+
 	return 0;
 
 out_remove_root:
@@ -669,6 +673,7 @@ static const struct file_operations monitoring_on_fops = {
 
 static void destroy_monitor_dir(struct rv_monitor_def *mdef)
 {
+	reactor_cleanup_monitor(mdef);
 	rv_remove(mdef->root_d);
 }
 
@@ -747,6 +752,7 @@ int rv_unregister_monitor(struct rv_monitor *monitor)
 int __init rv_init_interface(void)
 {
 	struct dentry *tmp;
+	int retval;
 
 	rv_root.root_dir = rv_create_dir("rv", NULL);
 	if (!rv_root.root_dir)
@@ -770,6 +776,9 @@ int __init rv_init_interface(void)
 			     &monitoring_on_fops);
 	if (!tmp)
 		goto out_err;
+	retval = init_rv_reactors(rv_root.root_dir);
+	if (retval)
+		goto out_err;
 
 	turn_monitoring_on();
 
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 50014aa224a7..db6cb0913dbd 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -18,16 +18,51 @@ struct rv_interface {
 #define rv_remove			tracefs_remove
 
 #define MAX_RV_MONITOR_NAME_SIZE	32
+#define MAX_RV_REACTOR_NAME_SIZE	32
 
 extern struct mutex rv_interface_lock;
 
+#ifdef CONFIG_RV_REACTORS
+struct rv_reactor_def {
+	struct list_head	list;
+	struct rv_reactor	*reactor;
+	/* protected by the monitor interface lock */
+	int			counter;
+};
+#endif
+
 struct rv_monitor_def {
 	struct list_head	list;
 	struct rv_monitor	*monitor;
 	struct dentry		*root_d;
+#ifdef CONFIG_RV_REACTORS
+	struct rv_reactor_def	*rdef;
+	bool			reacting;
+#endif
 	bool			task_monitor;
 };
 
 struct dentry *get_monitors_root(void);
 int rv_disable_monitor(struct rv_monitor_def *mdef);
 int rv_enable_monitor(struct rv_monitor_def *mdef);
+
+#ifdef CONFIG_RV_REACTORS
+int reactor_populate_monitor(struct rv_monitor_def *mdef);
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int init_rv_reactors(struct dentry *root_dir);
+#else
+static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+	return 0;
+}
+
+static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+	return;
+}
+
+static inline int init_rv_reactors(struct dentry *root_dir)
+{
+	return 0;
+}
+#endif
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
new file mode 100644
index 000000000000..a6522c196382
--- /dev/null
+++ b/kernel/trace/rv/rv_reactors.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Runtime reactor interface.
+ *
+ * A runtime monitor can cause a reaction to the detection of an
+ * exception on the model's execution. By default, the monitors have
+ * tracing reactions, printing the monitor output via tracepoints.
+ * But other reactions can be added (on-demand) via this interface.
+ *
+ * == Registering reactors ==
+ *
+ * The struct rv_reactor defines a callback function to be executed
+ * in case of a model exception happens. The callback function
+ * receives a message to be (optionally) printed before executing
+ * the reaction.
+ *
+ * A RV reactor is registered via:
+ *   int rv_register_reactor(struct rv_reactor *reactor)
+ * And unregistered via:
+ *   int rv_unregister_reactor(struct rv_reactor *reactor)
+ *
+ * These functions are exported to modules, enabling reactors to be
+ * dynamically loaded.
+ *
+ * == User interface ==
+ *
+ * The user interface resembles the kernel tracing interface and
+ * presents these files:
+ *
+ *  "available_reactors"
+ *    - List the available reactors, one per line.
+ *
+ *    For example:
+ *      # cat available_reactors
+ *      nop
+ *      panic
+ *      printk
+ *
+ *  "reacting_on"
+ *    - It is an on/off general switch for reactors, disabling
+ *    all reactions.
+ *
+ *  "monitors/MONITOR/reactors"
+ *    - List available reactors, with the select reaction for the given
+ *    MONITOR inside []. The default one is the nop (no operation)
+ *    reactor.
+ *    - Writing the name of an reactor enables it to the given
+ *    MONITOR.
+ *
+ *    For example:
+ *      # cat monitors/wip/reactors
+ *      [nop]
+ *      panic
+ *      printk
+ *      # echo panic > monitors/wip/reactors
+ *      # cat monitors/wip/reactors
+ *      nop
+ *      [panic]
+ *      printk
+ */
+
+#include <linux/slab.h>
+
+#include "rv.h"
+
+/*
+ * Interface for the reactor register.
+ */
+static LIST_HEAD(rv_reactors_list);
+
+static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+{
+	struct rv_reactor_def *r;
+
+	list_for_each_entry(r, &rv_reactors_list, list) {
+		if (strcmp(name, r->reactor->name) == 0)
+			return r;
+	}
+	return NULL;
+}
+
+/*
+ * Available reactors seq functions.
+ */
+static int reactors_show(struct seq_file *m, void *p)
+{
+	struct rv_reactor_def *rea_def = p;
+
+	seq_printf(m, "%s\n", rea_def->reactor->name);
+	return 0;
+}
+
+static void reactors_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&rv_interface_lock);
+}
+
+static void *reactors_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&rv_interface_lock);
+	return seq_list_start(&rv_reactors_list, *pos);
+}
+
+static void *reactors_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &rv_reactors_list, pos);
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations available_reactors_seq_ops = {
+	.start	= reactors_start,
+	.next	= reactors_next,
+	.stop	= reactors_stop,
+	.show	= reactors_show
+};
+
+/*
+ * available_reactors interface.
+ */
+static int available_reactors_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &available_reactors_seq_ops);
+};
+
+static const struct file_operations available_reactors_ops = {
+	.open    = available_reactors_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * Monitor's reactor file.
+ */
+static int monitor_reactor_show(struct seq_file *m, void *p)
+{
+	struct rv_monitor_def *mdef = m->private;
+	struct rv_reactor_def *rdef = p;
+
+	if (mdef->rdef == rdef)
+		seq_printf(m, "[%s]\n", rdef->reactor->name);
+	else
+		seq_printf(m, "%s\n", rdef->reactor->name);
+	return 0;
+}
+
+/*
+ * available_reactors seq definition.
+ */
+static const struct seq_operations monitor_reactors_seq_ops = {
+	.start	= reactors_start,
+	.next	= reactors_next,
+	.stop	= reactors_stop,
+	.show	= monitor_reactor_show
+};
+
+static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
+				    bool reacting)
+{
+	bool monitor_enabled;
+
+	/* nothing to do */
+	if (mdef->rdef == rdef)
+		return;
+
+	monitor_enabled = mdef->monitor->enabled;
+	if (monitor_enabled)
+		rv_disable_monitor(mdef);
+
+	/* swap reactor's usage */
+	mdef->rdef->counter--;
+	rdef->counter++;
+
+	mdef->rdef = rdef;
+	mdef->reacting = reacting;
+	mdef->monitor->react = rdef->reactor->react;
+
+	if (monitor_enabled)
+		rv_enable_monitor(mdef);
+}
+
+static ssize_t
+monitor_reactors_write(struct file *file, const char __user *user_buf,
+		      size_t count, loff_t *ppos)
+{
+	char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
+	struct rv_monitor_def *mdef;
+	struct rv_reactor_def *rdef;
+	struct seq_file *seq_f;
+	int retval = -EINVAL;
+	bool enable;
+	char *ptr;
+	int len;
+
+	if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1)
+		return -EINVAL;
+
+	memset(buff, 0, sizeof(buff));
+
+	retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count);
+	if (retval < 0)
+		return -EFAULT;
+
+	ptr = strim(buff);
+
+	len = strlen(ptr);
+	if (!len)
+		return count;
+
+	/*
+	 * See monitor_reactors_open()
+	 */
+	seq_f = file->private_data;
+	mdef = seq_f->private;
+
+	mutex_lock(&rv_interface_lock);
+
+	retval = -EINVAL;
+
+	list_for_each_entry(rdef, &rv_reactors_list, list) {
+		if (strcmp(ptr, rdef->reactor->name) != 0)
+			continue;
+
+		if (rdef == get_reactor_rdef_by_name("nop"))
+			enable = false;
+		else
+			enable = true;
+
+		monitor_swap_reactors(mdef, rdef, enable);
+
+		retval = count;
+		break;
+	}
+
+	mutex_unlock(&rv_interface_lock);
+
+	return retval;
+}
+
+/*
+ * available_reactors interface.
+ */
+static int monitor_reactors_open(struct inode *inode, struct file *file)
+{
+	struct rv_monitor_def *mdef = inode->i_private;
+	struct seq_file *seq_f;
+	int ret;
+
+	ret = seq_open(file, &monitor_reactors_seq_ops);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * seq_open stores the seq_file on the file->private data.
+	 */
+	seq_f = file->private_data;
+
+	/*
+	 * Copy the create file "private" data to the seq_file private data.
+	 */
+	seq_f->private = mdef;
+
+	return 0;
+};
+
+static const struct file_operations monitor_reactors_ops = {
+	.open    = monitor_reactors_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write = monitor_reactors_write
+};
+
+static int __rv_register_reactor(struct rv_reactor *reactor)
+{
+	struct rv_reactor_def *r;
+
+	list_for_each_entry(r, &rv_reactors_list, list) {
+		if (strcmp(reactor->name, r->reactor->name) == 0) {
+			pr_info("Reactor %s is already registered\n", reactor->name);
+			return -EINVAL;
+		}
+	}
+
+	r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	r->reactor = reactor;
+	r->counter = 0;
+
+	list_add_tail(&r->list, &rv_reactors_list);
+
+	return 0;
+}
+
+/**
+ * rv_register_reactor - register a rv reactor.
+ * @reactor:	The rv_reactor to be registered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_register_reactor(struct rv_reactor *reactor)
+{
+	int retval = 0;
+
+	if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) {
+		pr_info("Reactor %s has a name longer than %d\n",
+			reactor->name, MAX_RV_MONITOR_NAME_SIZE);
+		return -EINVAL;
+	}
+
+	mutex_lock(&rv_interface_lock);
+	retval = __rv_register_reactor(reactor);
+	mutex_unlock(&rv_interface_lock);
+	return retval;
+}
+
+/**
+ * rv_unregister_reactor - unregister a rv reactor.
+ * @reactor:	The rv_reactor to be unregistered.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int rv_unregister_reactor(struct rv_reactor *reactor)
+{
+	struct rv_reactor_def *ptr, *next;
+
+	mutex_lock(&rv_interface_lock);
+
+	list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
+		if (strcmp(reactor->name, ptr->reactor->name) == 0) {
+
+			if (!ptr->counter) {
+				list_del(&ptr->list);
+			} else {
+				printk(KERN_WARNING
+				       "rv: the rv_reactor %s is in use by %d monitor(s)\n",
+				       ptr->reactor->name, ptr->counter);
+				printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
+				       ptr->reactor->name);
+				return -EBUSY;
+			}
+		}
+	}
+
+	mutex_unlock(&rv_interface_lock);
+	return 0;
+}
+
+/*
+ * reacting_on interface.
+ */
+static bool __read_mostly reacting_on;
+
+/**
+ * rv_reacting_on - checks if reacting is on
+ *
+ * Returns 1 if on, 0 otherwise.
+ */
+bool rv_reacting_on(void)
+{
+	/* Ensures that concurrent monitors read consistent reacting_on */
+	smp_rmb();
+	return READ_ONCE(reacting_on);
+}
+
+static ssize_t reacting_on_read_data(struct file *filp,
+				     char __user *user_buf,
+				     size_t count, loff_t *ppos)
+{
+	char *buff;
+
+	buff = rv_reacting_on() ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
+}
+
+static void turn_reacting_off(void)
+{
+	WRITE_ONCE(reacting_on, false);
+	/* Ensures that concurrent monitors read consistent reacting_on */
+	smp_wmb();
+}
+
+static void turn_reacting_on(void)
+{
+	WRITE_ONCE(reacting_on, true);
+	/* Ensures that concurrent monitors read consistent reacting_on */
+	smp_wmb();
+}
+
+static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	int retval;
+	bool val;
+
+	retval = kstrtobool_from_user(user_buf, count, &val);
+	if (retval)
+		return retval;
+
+	mutex_lock(&rv_interface_lock);
+
+	if (val)
+		turn_reacting_on();
+	else
+		turn_reacting_off();
+
+	/*
+	 * Wait for the execution of all events to finish
+	 * before returning to user-space.
+	 */
+	tracepoint_synchronize_unregister();
+
+	mutex_unlock(&rv_interface_lock);
+
+	return count;
+}
+
+static const struct file_operations reacting_on_fops = {
+	.open   = simple_open,
+	.llseek = no_llseek,
+	.write  = reacting_on_write_data,
+	.read   = reacting_on_read_data,
+};
+
+/**
+ * reactor_populate_monitor - creates per monitor reactors file
+ * @mdef:	monitor's definition.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int reactor_populate_monitor(struct rv_monitor_def *mdef)
+{
+	struct dentry *tmp;
+
+	tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+	if (!tmp)
+		return -ENOMEM;
+
+	/*
+	 * Configure as the rv_nop reactor.
+	 */
+	mdef->rdef = get_reactor_rdef_by_name("nop");
+	mdef->rdef->counter++;
+	mdef->reacting = false;
+
+	return 0;
+}
+
+/**
+ * reactor_cleanup_monitor - cleanup a monitor reference
+ * @mdef:       monitor's definition.
+ */
+void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+{
+	lockdep_assert_held(&rv_interface_lock);
+	mdef->rdef->counter--;
+	WARN_ON_ONCE(mdef->rdef->counter < 0);
+}
+
+/*
+ * Nop reactor register
+ */
+static void rv_nop_reaction(char *msg)
+{
+}
+
+static struct rv_reactor rv_nop = {
+	.name = "nop",
+	.description = "no-operation reactor: do nothing.",
+	.react = rv_nop_reaction
+};
+
+int init_rv_reactors(struct dentry *root_dir)
+{
+	struct dentry *available, *reacting;
+	int retval;
+
+	available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL,
+				   &available_reactors_ops);
+	if (!available)
+		goto out_err;
+
+	reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops);
+	if (!reacting)
+		goto rm_available;
+
+	retval = __rv_register_reactor(&rv_nop);
+	if (retval)
+		goto rm_reacting;
+
+	turn_reacting_on();
+
+	return 0;
+
+rm_reacting:
+	rv_remove(reacting);
+rm_available:
+	rv_remove(available);
+out_err:
+	return -ENOMEM;
+}
-- 
cgit v1.2.3


From 792575348ff70e05c6040d02fce38e949ef92c37 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Fri, 29 Jul 2022 11:38:43 +0200
Subject: rv/include: Add deterministic automata monitor definition via C
 macros

In Linux terms, the runtime verification monitors are encapsulated
inside the "RV monitor" abstraction. The "RV monitor" includes a set
of instances of the monitor (per-cpu monitor, per-task monitor, and
so on), the helper functions that glue the monitor to the system
reference model, and the trace output as a reaction for event parsing
and exceptions, as depicted below:

Linux  +----- RV Monitor ----------------------------------+ Formal
 Realm |                                                   |  Realm
 +-------------------+     +----------------+     +-----------------+
 |   Linux kernel    |     |     Monitor    |     |     Reference   |
 |     Tracing       |  -> |   Instance(s)  | <-  |       Model     |
 | (instrumentation) |     | (verification) |     | (specification) |
 +-------------------+     +----------------+     +-----------------+
        |                          |                       |
        |                          V                       |
        |                     +----------+                 |
        |                     | Reaction |                 |
        |                     +--+--+--+-+                 |
        |                        |  |  |                   |
        |                        |  |  +-> trace output ?  |
        +------------------------|--|----------------------+
                                 |  +----> panic ?
                                 +-------> <user-specified>

Add the rv/da_monitor.h, enabling automatic code generation for the
*Monitor Instance(s)* using C macros, and code to support it.

The benefits of the usage of macro for monitor synthesis are 3-fold as it:

- Reduces the code duplication;
- Facilitates the bug fix/improvement;
- Avoids the case of developers changing the core of the monitor code
  to manipulate the model in a (let's say) non-standard way.

This initial implementation presents three different types of monitor
instances:

- DECLARE_DA_MON_GLOBAL(name, type)
- DECLARE_DA_MON_PER_CPU(name, type)
- DECLARE_DA_MON_PER_TASK(name, type)

The first declares the functions for a global deterministic automata monitor,
the second for monitors with per-cpu instances, and the third with per-task
instances.

Link: https://lkml.kernel.org/r/51b0bf425a281e226dfeba7401d2115d6091f84e.1659052063.git.bristot@kernel.org

Cc: Wim Van Sebroeck <wim@linux-watchdog.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Tao Zhou <tao.zhou@linux.dev>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-devel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h        |  10 +
 include/rv/da_monitor.h   | 541 ++++++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/rv.h | 120 ++++++++++
 kernel/fork.c             |  14 ++
 kernel/trace/rv/Kconfig   |  11 +
 kernel/trace/rv/rv.c      |   5 +
 6 files changed, 701 insertions(+)
 create mode 100644 include/rv/da_monitor.h
 create mode 100644 include/trace/events/rv.h

(limited to 'include/linux')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index dcce56987cf7..8883b41d88ec 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,7 +7,16 @@
 #ifndef _LINUX_RV_H
 #define _LINUX_RV_H
 
+#define MAX_DA_NAME_LEN	24
+
 #ifdef CONFIG_RV
+/*
+ * Deterministic automaton per-object variables.
+ */
+struct da_monitor {
+	bool		monitoring;
+	unsigned int	curr_state;
+};
 
 /*
  * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS.
@@ -22,6 +31,7 @@
  * Futher monitor types are expected, so make this a union.
  */
 union rv_task_monitor {
+	struct da_monitor da_mon;
 };
 
 #ifdef CONFIG_RV_REACTORS
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
new file mode 100644
index 000000000000..001bc298289f
--- /dev/null
+++ b/include/rv/da_monitor.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+ *
+ * Deterministic automata (DA) monitor functions, to be used together
+ * with automata models in C generated by the dot2k tool.
+ *
+ * The dot2k tool is available at tools/verification/dot2k/
+ */
+
+#include <rv/automata.h>
+#include <linux/rv.h>
+#include <linux/bug.h>
+
+#ifdef CONFIG_RV_REACTORS
+
+#define DECLARE_RV_REACTING_HELPERS(name, type)							\
+static char REACT_MSG_##name[1024];								\
+												\
+static inline char *format_react_msg_##name(type curr_state, type event)			\
+{												\
+	snprintf(REACT_MSG_##name, 1024,							\
+		 "rv: monitor %s does not allow event %s on state %s\n",			\
+		 #name,										\
+		 model_get_event_name_##name(event),						\
+		 model_get_state_name_##name(curr_state));					\
+	return REACT_MSG_##name;								\
+}												\
+												\
+static void cond_react_##name(char *msg)							\
+{												\
+	if (rv_##name.react)									\
+		rv_##name.react(msg);								\
+}												\
+												\
+static bool rv_reacting_on_##name(void)								\
+{												\
+	return rv_reacting_on();								\
+}
+
+#else /* CONFIG_RV_REACTOR */
+
+#define DECLARE_RV_REACTING_HELPERS(name, type)							\
+static inline char *format_react_msg_##name(type curr_state, type event)			\
+{												\
+	return NULL;										\
+}												\
+												\
+static void cond_react_##name(char *msg)							\
+{												\
+	return;											\
+}												\
+												\
+static bool rv_reacting_on_##name(void)								\
+{												\
+	return 0;										\
+}
+#endif
+
+/*
+ * Generic helpers for all types of deterministic automata monitors.
+ */
+#define DECLARE_DA_MON_GENERIC_HELPERS(name, type)						\
+												\
+DECLARE_RV_REACTING_HELPERS(name, type)								\
+												\
+/*												\
+ * da_monitor_reset_##name - reset a monitor and setting it to init state			\
+ */												\
+static inline void da_monitor_reset_##name(struct da_monitor *da_mon)				\
+{												\
+	da_mon->monitoring = 0;									\
+	da_mon->curr_state = model_get_initial_state_##name();					\
+}												\
+												\
+/*												\
+ * da_monitor_curr_state_##name - return the current state					\
+ */												\
+static inline type da_monitor_curr_state_##name(struct da_monitor *da_mon)			\
+{												\
+	return da_mon->curr_state;								\
+}												\
+												\
+/*												\
+ * da_monitor_set_state_##name - set the new current state					\
+ */												\
+static inline void										\
+da_monitor_set_state_##name(struct da_monitor *da_mon, enum states_##name state)		\
+{												\
+	da_mon->curr_state = state;								\
+}												\
+												\
+/*												\
+ * da_monitor_start_##name - start monitoring							\
+ *												\
+ * The monitor will ignore all events until monitoring is set to true. This			\
+ * function needs to be called to tell the monitor to start monitoring.				\
+ */												\
+static inline void da_monitor_start_##name(struct da_monitor *da_mon)				\
+{												\
+	da_mon->curr_state = model_get_initial_state_##name();					\
+	da_mon->monitoring = 1;									\
+}												\
+												\
+/*												\
+ * da_monitoring_##name - returns true if the monitor is processing events			\
+ */												\
+static inline bool da_monitoring_##name(struct da_monitor *da_mon)				\
+{												\
+	return da_mon->monitoring;								\
+}												\
+												\
+/*												\
+ * da_monitor_enabled_##name - checks if the monitor is enabled					\
+ */												\
+static inline bool da_monitor_enabled_##name(void)						\
+{												\
+	/* global switch */									\
+	if (unlikely(!rv_monitoring_on()))							\
+		return 0;									\
+												\
+	/* monitor enabled */									\
+	if (unlikely(!rv_##name.enabled))							\
+		return 0;									\
+												\
+	return 1;										\
+}												\
+												\
+/*												\
+ * da_monitor_handling_event_##name - checks if the monitor is ready to handle events		\
+ */												\
+static inline bool da_monitor_handling_event_##name(struct da_monitor *da_mon)			\
+{												\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	/* monitor is actually monitoring */							\
+	if (unlikely(!da_monitoring_##name(da_mon)))						\
+		return 0;									\
+												\
+	return 1;										\
+}
+
+/*
+ * Event handler for implicit monitors. Implicit monitor is the one which the
+ * handler does not need to specify which da_monitor to manipulate. Examples
+ * of implicit monitor are the per_cpu or the global ones.
+ */
+#define DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)					\
+												\
+static inline bool										\
+da_event_##name(struct da_monitor *da_mon, enum events_##name event)				\
+{												\
+	type curr_state = da_monitor_curr_state_##name(da_mon);					\
+	type next_state = model_get_next_state_##name(curr_state, event);			\
+												\
+	if (next_state != INVALID_STATE) {							\
+		da_monitor_set_state_##name(da_mon, next_state);				\
+												\
+		trace_event_##name(model_get_state_name_##name(curr_state),			\
+				   model_get_event_name_##name(event),				\
+				   model_get_state_name_##name(next_state),			\
+				   model_is_final_state_##name(next_state));			\
+												\
+		return true;									\
+	}											\
+												\
+	if (rv_reacting_on_##name())								\
+		cond_react_##name(format_react_msg_##name(curr_state, event));			\
+												\
+	trace_error_##name(model_get_state_name_##name(curr_state),				\
+			   model_get_event_name_##name(event));					\
+												\
+	return false;										\
+}												\
+
+/*
+ * Event handler for per_task monitors.
+ */
+#define DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type)					\
+												\
+static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct *tsk,		\
+				   enum events_##name event)					\
+{												\
+	type curr_state = da_monitor_curr_state_##name(da_mon);					\
+	type next_state = model_get_next_state_##name(curr_state, event);			\
+												\
+	if (next_state != INVALID_STATE) {							\
+		da_monitor_set_state_##name(da_mon, next_state);				\
+												\
+		trace_event_##name(tsk->pid,							\
+				   model_get_state_name_##name(curr_state),			\
+				   model_get_event_name_##name(event),				\
+				   model_get_state_name_##name(next_state),			\
+				   model_is_final_state_##name(next_state));			\
+												\
+		return true;									\
+	}											\
+												\
+	if (rv_reacting_on_##name())								\
+		cond_react_##name(format_react_msg_##name(curr_state, event));			\
+												\
+	trace_error_##name(tsk->pid,								\
+			   model_get_state_name_##name(curr_state),				\
+			   model_get_event_name_##name(event));					\
+												\
+	return false;										\
+}
+
+/*
+ * Functions to define, init and get a global monitor.
+ */
+#define DECLARE_DA_MON_INIT_GLOBAL(name, type)							\
+												\
+/*												\
+ * global monitor (a single variable)								\
+ */												\
+static struct da_monitor da_mon_##name;								\
+												\
+/*												\
+ * da_get_monitor_##name - return the global monitor address					\
+ */												\
+static struct da_monitor *da_get_monitor_##name(void)						\
+{												\
+	return &da_mon_##name;									\
+}												\
+												\
+/*												\
+ * da_monitor_reset_all_##name - reset the single monitor					\
+ */												\
+static void da_monitor_reset_all_##name(void)							\
+{												\
+	da_monitor_reset_##name(da_get_monitor_##name());					\
+}												\
+												\
+/*												\
+ * da_monitor_init_##name - initialize a monitor						\
+ */												\
+static inline int da_monitor_init_##name(void)							\
+{												\
+	da_monitor_reset_all_##name();								\
+	return 0;										\
+}												\
+												\
+/*												\
+ * da_monitor_destroy_##name - destroy the monitor						\
+ */												\
+static inline void da_monitor_destroy_##name(void)						\
+{												\
+	return;											\
+}
+
+/*
+ * Functions to define, init and get a per-cpu monitor.
+ */
+#define DECLARE_DA_MON_INIT_PER_CPU(name, type)							\
+												\
+/*												\
+ * per-cpu monitor variables									\
+ */												\
+DEFINE_PER_CPU(struct da_monitor, da_mon_##name);						\
+												\
+/*												\
+ * da_get_monitor_##name - return current CPU monitor address					\
+ */												\
+static struct da_monitor *da_get_monitor_##name(void)						\
+{												\
+	return this_cpu_ptr(&da_mon_##name);							\
+}												\
+												\
+/*												\
+ * da_monitor_reset_all_##name - reset all CPUs' monitor					\
+ */												\
+static void da_monitor_reset_all_##name(void)							\
+{												\
+	struct da_monitor *da_mon;								\
+	int cpu;										\
+	for_each_cpu(cpu, cpu_online_mask) {							\
+		da_mon = per_cpu_ptr(&da_mon_##name, cpu);					\
+		da_monitor_reset_##name(da_mon);						\
+	}											\
+}												\
+												\
+/*												\
+ * da_monitor_init_##name - initialize all CPUs' monitor					\
+ */												\
+static inline int da_monitor_init_##name(void)							\
+{												\
+	da_monitor_reset_all_##name();								\
+	return 0;										\
+}												\
+												\
+/*												\
+ * da_monitor_destroy_##name - destroy the monitor						\
+ */												\
+static inline void da_monitor_destroy_##name(void)						\
+{												\
+	return;											\
+}
+
+/*
+ * Functions to define, init and get a per-task monitor.
+ */
+#define DECLARE_DA_MON_INIT_PER_TASK(name, type)						\
+												\
+/*												\
+ * The per-task monitor is stored a vector in the task struct. This variable			\
+ * stores the position on the vector reserved for this monitor.					\
+ */												\
+static int task_mon_slot_##name = RV_PER_TASK_MONITOR_INIT;					\
+												\
+/*												\
+ * da_get_monitor_##name - return the monitor in the allocated slot for tsk 			\
+ */												\
+static inline struct da_monitor *da_get_monitor_##name(struct task_struct *tsk)			\
+{												\
+	return &tsk->rv[task_mon_slot_##name].da_mon;						\
+}												\
+												\
+static void da_monitor_reset_all_##name(void)							\
+{												\
+	struct task_struct *g, *p;								\
+												\
+	read_lock(&tasklist_lock);								\
+	for_each_process_thread(g, p)								\
+		da_monitor_reset_##name(da_get_monitor_##name(p));				\
+	read_unlock(&tasklist_lock);								\
+}												\
+												\
+/*												\
+ * da_monitor_init_##name - initialize the per-task monitor					\
+ *												\
+ * Try to allocate a slot in the task's vector of monitors. If there				\
+ * is an available slot, use it and reset all task's monitor.					\
+ */												\
+static int da_monitor_init_##name(void)								\
+{												\
+	int slot;										\
+												\
+	slot = rv_get_task_monitor_slot();							\
+	if (slot < 0 || slot >= RV_PER_TASK_MONITOR_INIT)					\
+		return slot;									\
+												\
+	task_mon_slot_##name = slot;								\
+												\
+	da_monitor_reset_all_##name();								\
+	return 0;										\
+}												\
+												\
+/*												\
+ * da_monitor_destroy_##name - return the allocated slot					\
+ */												\
+static inline void da_monitor_destroy_##name(void)						\
+{												\
+	if (task_mon_slot_##name == RV_PER_TASK_MONITOR_INIT) {					\
+		WARN_ONCE(1, "Disabling a disabled monitor: " #name);				\
+		return;										\
+	}											\
+	rv_put_task_monitor_slot(task_mon_slot_##name);						\
+	task_mon_slot_##name = RV_PER_TASK_MONITOR_INIT;					\
+	return;											\
+}
+
+/*
+ * Handle event for implicit monitor: da_get_monitor_##name() will figure out
+ * the monitor.
+ */
+#define DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type)					\
+												\
+static inline void __da_handle_event_##name(struct da_monitor *da_mon,				\
+					    enum events_##name event)				\
+{												\
+	bool retval;										\
+												\
+	retval = da_event_##name(da_mon, event);						\
+	if (!retval)										\
+		da_monitor_reset_##name(da_mon);						\
+}												\
+												\
+/*												\
+ * da_handle_event_##name - handle an event							\
+ */												\
+static inline void da_handle_event_##name(enum events_##name event)				\
+{												\
+	struct da_monitor *da_mon = da_get_monitor_##name();					\
+	bool retval;										\
+												\
+	retval = da_monitor_handling_event_##name(da_mon);					\
+	if (!retval)										\
+		return;										\
+												\
+	__da_handle_event_##name(da_mon, event);						\
+}												\
+												\
+/*												\
+ * da_handle_start_event_##name - start monitoring or handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is returning			\
+ * to the initial state, so the monitor can start monitoring in the next event.			\
+ * Thus:											\
+ *												\
+ * If the monitor already started, handle the event.						\
+ * If the monitor did not start yet, start the monitor but skip the event.			\
+ */												\
+static inline bool da_handle_start_event_##name(enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name();							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon))) {						\
+		da_monitor_start_##name(da_mon);						\
+		return 0;									\
+	}											\
+												\
+	__da_handle_event_##name(da_mon, event);						\
+												\
+	return 1;										\
+}												\
+												\
+/*												\
+ * da_handle_start_run_event_##name - start monitoring and handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is in the			\
+ * initial state, so the monitor can start monitoring and handling event.			\
+ */												\
+static inline bool da_handle_start_run_event_##name(enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name();							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon)))						\
+		da_monitor_start_##name(da_mon);						\
+												\
+	__da_handle_event_##name(da_mon, event);						\
+												\
+	return 1;										\
+}
+
+/*
+ * Handle event for per task.
+ */
+#define DECLARE_DA_MON_MONITOR_HANDLER_PER_TASK(name, type)					\
+												\
+static inline void										\
+__da_handle_event_##name(struct da_monitor *da_mon, struct task_struct *tsk,			\
+			 enum events_##name event)						\
+{												\
+	bool retval;										\
+												\
+	retval = da_event_##name(da_mon, tsk, event);						\
+	if (!retval)										\
+		da_monitor_reset_##name(da_mon);						\
+}												\
+												\
+/*												\
+ * da_handle_event_##name - handle an event							\
+ */												\
+static inline void										\
+da_handle_event_##name(struct task_struct *tsk, enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon = da_get_monitor_##name(tsk);					\
+	bool retval;										\
+												\
+	retval = da_monitor_handling_event_##name(da_mon);					\
+	if (!retval)										\
+		return;										\
+												\
+	__da_handle_event_##name(da_mon, tsk, event);						\
+}												\
+												\
+/*												\
+ * da_handle_start_event_##name - start monitoring or handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is returning			\
+ * to the initial state, so the monitor can start monitoring in the next event.			\
+ * Thus:											\
+ *												\
+ * If the monitor already started, handle the event.						\
+ * If the monitor did not start yet, start the monitor but skip the event.			\
+ */												\
+static inline bool										\
+da_handle_start_event_##name(struct task_struct *tsk, enum events_##name event)			\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name(tsk);							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon))) {						\
+		da_monitor_start_##name(da_mon);						\
+		return 0;									\
+	}											\
+												\
+	__da_handle_event_##name(da_mon, tsk, event);						\
+												\
+	return 1;										\
+}
+
+/*
+ * Entry point for the global monitor.
+ */
+#define DECLARE_DA_MON_GLOBAL(name, type)							\
+												\
+DECLARE_AUTOMATA_HELPERS(name, type)								\
+DECLARE_DA_MON_GENERIC_HELPERS(name, type)							\
+DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)						\
+DECLARE_DA_MON_INIT_GLOBAL(name, type)								\
+DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type)
+
+/*
+ * Entry point for the per-cpu monitor.
+ */
+#define DECLARE_DA_MON_PER_CPU(name, type)							\
+												\
+DECLARE_AUTOMATA_HELPERS(name, type)								\
+DECLARE_DA_MON_GENERIC_HELPERS(name, type)							\
+DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)						\
+DECLARE_DA_MON_INIT_PER_CPU(name, type)								\
+DECLARE_DA_MON_MONITOR_HANDLER_IMPLICIT(name, type)
+
+/*
+ * Entry point for the per-task monitor.
+ */
+#define DECLARE_DA_MON_PER_TASK(name, type)							\
+												\
+DECLARE_AUTOMATA_HELPERS(name, type)								\
+DECLARE_DA_MON_GENERIC_HELPERS(name, type)							\
+DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type)						\
+DECLARE_DA_MON_INIT_PER_TASK(name, type)							\
+DECLARE_DA_MON_MONITOR_HANDLER_PER_TASK(name, type)
diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h
new file mode 100644
index 000000000000..20a2e09c6416
--- /dev/null
+++ b/include/trace/events/rv.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rv
+
+#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RV_H
+
+#include <linux/rv.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT
+DECLARE_EVENT_CLASS(event_da_monitor,
+
+	TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+
+	TP_ARGS(state, event, next_state, final_state),
+
+	TP_STRUCT__entry(
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+		__array(	char,	next_state,	MAX_DA_NAME_LEN	)
+		__field(	bool,	final_state			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+		memcpy(__entry->next_state,	next_state,	MAX_DA_NAME_LEN);
+		__entry->final_state		= final_state;
+	),
+
+	TP_printk("%s x %s -> %s %s",
+		__entry->state,
+		__entry->event,
+		__entry->next_state,
+		__entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor,
+
+	TP_PROTO(char *state, char *event),
+
+	TP_ARGS(state, event),
+
+	TP_STRUCT__entry(
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+	),
+
+	TP_printk("event %s not expected in the state %s",
+		__entry->event,
+		__entry->state)
+);
+#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
+
+#ifdef CONFIG_DA_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_da_monitor_id,
+
+	TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+
+	TP_ARGS(id, state, event, next_state, final_state),
+
+	TP_STRUCT__entry(
+		__field(	int,	id				)
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+		__array(	char,	next_state,	MAX_DA_NAME_LEN	)
+		__field(	bool,	final_state			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+		memcpy(__entry->next_state,	next_state,	MAX_DA_NAME_LEN);
+		__entry->id			= id;
+		__entry->final_state		= final_state;
+	),
+
+	TP_printk("%d: %s x %s -> %s %s",
+		__entry->id,
+		__entry->state,
+		__entry->event,
+		__entry->next_state,
+		__entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor_id,
+
+	TP_PROTO(int id, char *state, char *event),
+
+	TP_ARGS(id, state, event),
+
+	TP_STRUCT__entry(
+		__field(	int,	id				)
+		__array(	char,	state,		MAX_DA_NAME_LEN	)
+		__array(	char,	event,		MAX_DA_NAME_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->state,		state,		MAX_DA_NAME_LEN);
+		memcpy(__entry->event,		event,		MAX_DA_NAME_LEN);
+		__entry->id			= id;
+	),
+
+	TP_printk("%d: event %s not expected in the state %s",
+		__entry->id,
+		__entry->event,
+		__entry->state)
+);
+#endif /* CONFIG_DA_MON_EVENTS_ID */
+#endif /* _TRACE_RV_H */
+
+/* This part ust be outside protection */
+#undef TRACE_INCLUDE_PATH
+#include <trace/define_trace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..6f1f82ccd5f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1964,6 +1964,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
 	mutex_unlock(&oom_adj_mutex);
 }
 
+#ifdef CONFIG_RV
+static void rv_task_fork(struct task_struct *p)
+{
+	int i;
+
+	for (i = 0; i < RV_PER_TASK_MONITORS; i++)
+		p->rv[i].da_mon.monitoring = false;
+}
+#else
+#define rv_task_fork(p) do {} while (0)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -2399,6 +2411,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	rv_task_fork(p);
+
 	rseq_fork(p, clone_flags);
 
 	/* Don't start children in a dying pid namespace */
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 3eb5d48ab4f6..8714800e22ad 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -1,5 +1,16 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
+config DA_MON_EVENTS
+	bool
+
+config DA_MON_EVENTS_IMPLICIT
+	select DA_MON_EVENTS
+	bool
+
+config DA_MON_EVENTS_ID
+	select DA_MON_EVENTS
+	bool
+
 menuconfig RV
 	bool "Runtime Verification"
 	depends on TRACING
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 45cf64eb2600..df6678c86334 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -140,6 +140,11 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 
+#ifdef CONFIG_DA_MON_EVENTS
+#define CREATE_TRACE_POINTS
+#include <trace/events/rv.h>
+#endif
+
 #include "rv.h"
 
 DEFINE_MUTEX(rv_interface_lock);
-- 
cgit v1.2.3


From 36d4b36b69590fed99356a4426c940a253a93800 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Mon, 25 Jul 2022 09:39:17 -0700
Subject: lib/nodemask: inline next_node_in() and node_random()

The functions are pretty thin wrappers around find_bit engine, and
keeping them in c-file prevents compiler from small_const_nbits()
optimization, which must take place for all systems with MAX_NUMNODES
less than BITS_PER_LONG (default is 16 for me).

Moving them to header file doesn't blow up the kernel size:
add/remove: 1/2 grow/shrink: 9/5 up/down: 968/-88 (880)

CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Paul Mackerras <paulus@samba.org>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 MAINTAINERS              |  1 -
 include/linux/nodemask.h | 24 +++++++++++++++++++-----
 lib/Makefile             |  2 +-
 lib/nodemask.c           |  8 --------
 4 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7c0b8f28aa25..19c8d0ef1177 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3540,7 +3540,6 @@ F:	lib/bitmap.c
 F:	lib/cpumask.c
 F:	lib/find_bit.c
 F:	lib/find_bit_benchmark.c
-F:	lib/nodemask.c
 F:	lib/test_bitmap.c
 F:	tools/include/linux/bitmap.h
 F:	tools/include/linux/find.h
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 0f233b76c9ce..4b71a96190a8 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -94,6 +94,7 @@
 #include <linux/bitmap.h>
 #include <linux/minmax.h>
 #include <linux/numa.h>
+#include <linux/random.h>
 
 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
@@ -276,7 +277,14 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp)
  * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
  */
 #define next_node_in(n, src) __next_node_in((n), &(src))
-unsigned int __next_node_in(int node, const nodemask_t *srcp);
+static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
+{
+	unsigned int ret = __next_node(node, srcp);
+
+	if (ret == MAX_NUMNODES)
+		ret = __first_node(srcp);
+	return ret;
+}
 
 static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
@@ -493,14 +501,20 @@ static inline int num_node_state(enum node_states state)
 
 #endif
 
+static inline int node_random(const nodemask_t *maskp)
+{
 #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
-extern int node_random(const nodemask_t *maskp);
+	int w, bit = NUMA_NO_NODE;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
 #else
-static inline int node_random(const nodemask_t *mask)
-{
 	return 0;
-}
 #endif
+}
 
 #define node_online_map 	node_states[N_ONLINE]
 #define node_possible_map 	node_states[N_POSSIBLE]
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..731cea0342d1 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -33,7 +33,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
-	 nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \
+	 nmi_backtrace.o win_minmax.o memcat_p.o \
 	 buildid.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
diff --git a/lib/nodemask.c b/lib/nodemask.c
index e22647f5181b..b8a433d16b51 100644
--- a/lib/nodemask.c
+++ b/lib/nodemask.c
@@ -3,14 +3,6 @@
 #include <linux/module.h>
 #include <linux/random.h>
 
-unsigned int __next_node_in(int node, const nodemask_t *srcp)
-{
-	unsigned int ret = __next_node(node, srcp);
-
-	if (ret == MAX_NUMNODES)
-		ret = __first_node(srcp);
-	return ret;
-}
 EXPORT_SYMBOL(__next_node_in);
 
 #ifdef CONFIG_NUMA
-- 
cgit v1.2.3


From 170ab26b01d7f445c46261eff69341dab7e50a24 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Thu, 21 Jul 2022 16:19:04 +0800
Subject: tracepoints: It is CONFIG_TRACEPOINTS not CONFIG_TRACEPOINT

When reading this note, CONFIG_TRACEPOINT searches my configuration
file, and the result is CONFIG_TRACEPOINTS, the search results are
consistent with the following macro definitions. I think it should be
repaired.

Link: https://lkml.kernel.org/r/20220721081904.3798-1-zeming@nfschina.com

Signed-off-by: Li zeming <zeming@nfschina.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 28031b15f878..2908cc5ed70e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -151,7 +151,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 /*
  * Individual subsystem my have a separate configuration to
  * enable their tracepoints. By default, this file will create
- * the tracepoints if CONFIG_TRACEPOINT is defined. If a subsystem
+ * the tracepoints if CONFIG_TRACEPOINTS is defined. If a subsystem
  * wants to be able to disable its tracepoints from being created
  * it can define NOTRACE before including the tracepoint headers.
  */
-- 
cgit v1.2.3


From d9c26e0a58b0039d1ad4340d826fe8db866e9455 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Wed, 8 Jun 2022 22:40:55 +0800
Subject: mailbox: mtk-cmdq: Remove proprietary cmdq_task_cb

rx_callback is a standard mailbox callback mechanism and could cover the
function of proprietary cmdq_task_cb, so use the standard one instead of
the proprietary one. Client driver has changed to use standard
rx_callback, so remove proprietary cmdq_task_cb.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mtk-cmdq-mailbox.c       | 11 -----------
 include/linux/mailbox/mtk-cmdq-mailbox.h | 10 ----------
 2 files changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
index 2578e5aaa935..9465f9081515 100644
--- a/drivers/mailbox/mtk-cmdq-mailbox.c
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -192,15 +192,10 @@ static bool cmdq_thread_is_in_wfe(struct cmdq_thread *thread)
 
 static void cmdq_task_exec_done(struct cmdq_task *task, int sta)
 {
-	struct cmdq_task_cb *cb = &task->pkt->async_cb;
 	struct cmdq_cb_data data;
 
 	data.sta = sta;
-	data.data = cb->data;
 	data.pkt = task->pkt;
-	if (cb->cb)
-		cb->cb(data);
-
 	mbox_chan_received_data(task->thread->chan, &data);
 
 	list_del(&task->list_entry);
@@ -448,7 +443,6 @@ done:
 static int cmdq_mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 {
 	struct cmdq_thread *thread = (struct cmdq_thread *)chan->con_priv;
-	struct cmdq_task_cb *cb;
 	struct cmdq_cb_data data;
 	struct cmdq *cmdq = dev_get_drvdata(chan->mbox->dev);
 	struct cmdq_task *task, *tmp;
@@ -465,13 +459,8 @@ static int cmdq_mbox_flush(struct mbox_chan *chan, unsigned long timeout)
 
 	list_for_each_entry_safe(task, tmp, &thread->task_busy_list,
 				 list_entry) {
-		cb = &task->pkt->async_cb;
 		data.sta = -ECONNABORTED;
-		data.data = cb->data;
 		data.pkt = task->pkt;
-		if (cb->cb)
-			cb->cb(data);
-
 		mbox_chan_received_data(task->thread->chan, &data);
 		list_del(&task->list_entry);
 		kfree(task);
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index 44365aab043c..a8f0070c7aa9 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -67,24 +67,14 @@ enum cmdq_code {
 
 struct cmdq_cb_data {
 	int			sta;
-	void			*data;
 	struct cmdq_pkt		*pkt;
 };
 
-typedef void (*cmdq_async_flush_cb)(struct cmdq_cb_data data);
-
-struct cmdq_task_cb {
-	cmdq_async_flush_cb	cb;
-	void			*data;
-};
-
 struct cmdq_pkt {
 	void			*va_base;
 	dma_addr_t		pa_base;
 	size_t			cmd_buf_size; /* command occupied size */
 	size_t			buf_size; /* real buffer size */
-	struct cmdq_task_cb	cb;
-	struct cmdq_task_cb	async_cb;
 	void			*cl;
 };
 
-- 
cgit v1.2.3


From d3e94fdc4ef476ca1edd468cc11badf2dbbb3c00 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 8 Jan 2021 15:34:38 -0500
Subject: fscrypt: export fscrypt_fname_encrypt and
 fscrypt_fname_encrypted_size

For ceph, we want to use our own scheme for handling filenames that are
are longer than NAME_MAX after encryption and Base64 encoding. This
allows us to have a consistent view of the encrypted filenames for
clients that don't support fscrypt and clients that do but that don't
have the key.

Currently, fs/crypto only supports encrypting filenames using
fscrypt_setup_filename, but that also handles encoding nokey names. Ceph
can't use that because it handles nokey names in a different way.

Export fscrypt_fname_encrypt. Rename fscrypt_fname_encrypted_size to
__fscrypt_fname_encrypted_size and add a new wrapper called
fscrypt_fname_encrypted_size that takes an inode argument rather than a
pointer to a fscrypt_policy union.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/crypto/fname.c           | 36 ++++++++++++++++++++++++++++++------
 fs/crypto/fscrypt_private.h |  9 +++------
 fs/crypto/hooks.c           |  6 +++---
 include/linux/fscrypt.h     |  4 ++++
 4 files changed, 40 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 14e0ef5e9a20..12bd61d20f69 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
 /**
  * fscrypt_fname_encrypt() - encrypt a filename
  * @inode: inode of the parent directory (for regular filenames)
- *	   or of the symlink (for symlink targets)
+ *	   or of the symlink (for symlink targets). Key must already be
+ *	   set up.
  * @iname: the filename to encrypt
  * @out: (output) the encrypted filename
  * @olen: size of the encrypted filename.  It must be at least @iname->len.
@@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
 
 /**
  * fname_decrypt() - decrypt a filename
@@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
 	return bp - dst;
 }
 
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
-				  u32 orig_len, u32 max_len,
-				  u32 *encrypted_len_ret)
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+				    u32 orig_len, u32 max_len,
+				    u32 *encrypted_len_ret)
 {
 	int padding = 4 << (fscrypt_policy_flags(policy) &
 			    FSCRYPT_POLICY_FLAGS_PAD_MASK);
@@ -280,6 +282,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
 	return true;
 }
 
+/**
+ * fscrypt_fname_encrypted_size() - calculate length of encrypted filename
+ * @inode:		parent inode of dentry name being encrypted. Key must
+ *			already be set up.
+ * @orig_len:		length of the original filename
+ * @max_len:		maximum length to return
+ * @encrypted_len_ret:	where calculated length should be returned (on success)
+ *
+ * Filenames that are shorter than the maximum length may have their lengths
+ * increased slightly by encryption, due to padding that is applied.
+ *
+ * Return: false if the orig_len is greater than max_len. Otherwise, true and
+ *	   fill out encrypted_len_ret with the length (up to max_len).
+ */
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+				  u32 max_len, u32 *encrypted_len_ret)
+{
+	return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
+					      orig_len, max_len,
+					      encrypted_len_ret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
+
 /**
  * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
  * @max_encrypted_len: maximum length of encrypted filenames the buffer will be
@@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
 		return ret;
 
 	if (fscrypt_has_encryption_key(dir)) {
-		if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
-						  iname->len, NAME_MAX,
+		if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX,
 						  &fname->crypto_buf.len))
 			return -ENAMETOOLONG;
 		fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 6b4c8094cc7b..11fe9d213ae1 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
 			 const struct fscrypt_info *ci);
 
 /* fname.c */
-int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
-			  u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
-				  u32 orig_len, u32 max_len,
-				  u32 *encrypted_len_ret);
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+				    u32 orig_len, u32 max_len,
+				    u32 *encrypted_len_ret);
 
 /* hkdf.c */
-
 struct fscrypt_hkdf {
 	struct crypto_shash *hmac_tfm;
 };
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index af74599ae1cf..7c01025879b3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target,
 	 * counting it (even though it is meaningless for ciphertext) is simpler
 	 * for now since filesystems will assume it is there and subtract it.
 	 */
-	if (!fscrypt_fname_encrypted_size(policy, len,
-					  max_len - sizeof(struct fscrypt_symlink_data),
-					  &disk_link->len))
+	if (!__fscrypt_fname_encrypted_size(policy, len,
+					    max_len - sizeof(struct fscrypt_symlink_data),
+					    &disk_link->len))
 		return -ENAMETOOLONG;
 	disk_link->len += sizeof(struct fscrypt_symlink_data);
 
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index e60d57c99cb6..5926a4081c6d 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -327,6 +327,10 @@ void fscrypt_free_inode(struct inode *inode);
 int fscrypt_drop_inode(struct inode *inode);
 
 /* fname.c */
+int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
+			  u8 *out, unsigned int olen);
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+				  u32 max_len, u32 *encrypted_len_ret);
 int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname,
 			   int lookup, struct fscrypt_name *fname);
 
-- 
cgit v1.2.3


From 637fa738b590ec0e3414931d1e07c4f195eb5215 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 1 Sep 2020 12:56:42 -0400
Subject: fscrypt: add fscrypt_context_for_new_inode

Most filesystems just call fscrypt_set_context on new inodes, which
usually causes a setxattr. That's a bit late for ceph, which can send
along a full set of attributes with the create request.

Doing so allows it to avoid race windows that where the new inode could
be seen by other clients without the crypto context attached. It also
avoids the separate round trip to the server.

Refactor the fscrypt code a bit to allow us to create a new crypto
context, attach it to the inode, and write it to the buffer, but without
calling set_context on it. ceph can later use this to marshal the
context into the attributes we send along with the create request.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Acked-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/crypto/policy.c      | 35 +++++++++++++++++++++++++++++------
 include/linux/fscrypt.h |  1 +
 2 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 5f858cee1e3b..a450189565e3 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -685,6 +685,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
 	return fscrypt_get_dummy_policy(dir->i_sb);
 }
 
+/**
+ * fscrypt_context_for_new_inode() - create an encryption context for a new inode
+ * @ctx: where context should be written
+ * @inode: inode from which to fetch policy and nonce
+ *
+ * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode,
+ * generate a new context and write it to ctx. ctx _must_ be at least
+ * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
+ *
+ * Return: size of the resulting context or a negative error code.
+ */
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
+{
+	struct fscrypt_info *ci = inode->i_crypt_info;
+
+	BUILD_BUG_ON(sizeof(union fscrypt_context) !=
+			FSCRYPT_SET_CONTEXT_MAX_SIZE);
+
+	/* fscrypt_prepare_new_inode() should have set up the key already. */
+	if (WARN_ON_ONCE(!ci))
+		return -ENOKEY;
+
+	return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce);
+}
+EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
+
 /**
  * fscrypt_set_context() - Set the fscrypt context of a new inode
  * @inode: a new inode
@@ -701,12 +727,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
 	union fscrypt_context ctx;
 	int ctxsize;
 
-	/* fscrypt_prepare_new_inode() should have set up the key already. */
-	if (WARN_ON_ONCE(!ci))
-		return -ENOKEY;
-
-	BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
-	ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+	ctxsize = fscrypt_context_for_new_inode(&ctx, inode);
+	if (ctxsize < 0)
+		return ctxsize;
 
 	/*
 	 * This may be the first time the inode number is available, so do any
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 5926a4081c6d..7d2f1e0f23b1 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -284,6 +284,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);
 int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg);
 int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg);
 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child);
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode);
 int fscrypt_set_context(struct inode *inode, void *fs_data);
 
 struct fscrypt_dummy_policy {
-- 
cgit v1.2.3


From 4f48d5da81ee7004a789c8aac2d0dfb2514c37f1 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 16 May 2022 11:23:19 +0800
Subject: fs/dcache: export d_same_name() helper

Compare dentry name with case-exact name, return true if names
are same, or false.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/dcache.c            | 15 +++++++++++----
 include/linux/dcache.h |  2 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 93f4f5ee07bf..a409312ee0df 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2247,10 +2247,16 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 }
 EXPORT_SYMBOL(d_add_ci);
 
-
-static inline bool d_same_name(const struct dentry *dentry,
-				const struct dentry *parent,
-				const struct qstr *name)
+/**
+ * d_same_name - compare dentry name with case-exact name
+ * @parent: parent dentry
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @name:   the case-exact name to be associated with the returned dentry
+ *
+ * Return: true if names are same, or false
+ */
+bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
+		 const struct qstr *name)
 {
 	if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
 		if (dentry->d_name.len != name->len)
@@ -2261,6 +2267,7 @@ static inline bool d_same_name(const struct dentry *dentry,
 				       dentry->d_name.len, dentry->d_name.name,
 				       name) == 0;
 }
+EXPORT_SYMBOL_GPL(d_same_name);
 
 /**
  * __d_lookup_rcu - search for a dentry (racy, store-free)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f5bba51480b2..bb72361834de 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -233,6 +233,8 @@ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
 					wait_queue_head_t *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
+extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
+			const struct qstr *name);
 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
 extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
-- 
cgit v1.2.3


From d93231a6bc8a452323d5fef16cca7107ce483a27 Mon Sep 17 00:00:00 2001
From: Luís Henriques <lhenriques@suse.de>
Date: Fri, 3 Jun 2022 14:29:09 +0100
Subject: ceph: prevent a client from exceeding the MDS maximum xattr size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MDS tries to enforce a limit on the total key/values in extended
attributes.  However, this limit is enforced only if doing a synchronous
operation (MDS_OP_SETXATTR) -- if we're buffering the xattrs, the MDS
doesn't have a chance to enforce these limits.

This patch adds support for decoding the xattrs maximum size setting that is
distributed in the mdsmap.  Then, when setting an xattr, the kernel client
will revert to do a synchronous operation if that maximum size is exceeded.

While there, fix a dout() that would trigger a printk warning:

[   98.718078] ------------[ cut here ]------------
[   98.719012] precision 65536 too large
[   98.719039] WARNING: CPU: 1 PID: 3755 at lib/vsprintf.c:2703 vsnprintf+0x5e3/0x600
...

Link: https://tracker.ceph.com/issues/55725
Signed-off-by: Luís Henriques <lhenriques@suse.de>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mdsmap.c            | 22 ++++++++++++++++++----
 fs/ceph/xattr.c             | 12 ++++++++----
 include/linux/ceph/mdsmap.h |  1 +
 3 files changed, 27 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 30387733765d..8d0a6d2c2da4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 		__decode_and_drop_type(p, end, u8, bad_ext);
 	}
 	if (mdsmap_ev >= 8) {
-		u32 name_len;
 		/* enabled */
 		ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
-		ceph_decode_32_safe(p, end, name_len, bad_ext);
-		ceph_decode_need(p, end, name_len, bad_ext);
-		*p += name_len;
+		/* fs_name */
+		ceph_decode_skip_string(p, end, bad_ext);
 	}
 	/* damaged */
 	if (mdsmap_ev >= 9) {
@@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 	} else {
 		m->m_damaged = false;
 	}
+	if (mdsmap_ev >= 17) {
+		/* balancer */
+		ceph_decode_skip_string(p, end, bad_ext);
+		/* standby_count_wanted */
+		ceph_decode_skip_32(p, end, bad_ext);
+		/* old_max_mds */
+		ceph_decode_skip_32(p, end, bad_ext);
+		/* min_compat_client */
+		ceph_decode_skip_8(p, end, bad_ext);
+		/* required_client_features */
+		ceph_decode_skip_set(p, end, 64, bad_ext);
+		ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+	} else {
+		/* This forces the usage of the (sync) SETXATTR Op */
+		m->m_max_xattr_size = 0;
+	}
 bad_ext:
 	dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
 	     !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f141f5246163..f31350cda960 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1086,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 			flags |= CEPH_XATTR_REMOVE;
 	}
 
-	dout("setxattr value=%.*s\n", (int)size, value);
+	dout("setxattr value size: %zu\n", size);
 
 	/* do request */
 	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1184,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
-	if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+	if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+	    (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+		dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+		     __func__, ci->i_xattrs.version, required_blob_size,
+		     mdsc->mdsmap->m_max_xattr_size);
 		goto do_sync;
+	}
 
 	if (!lock_snap_rwsem && !ci->i_head_snapc) {
 		lock_snap_rwsem = true;
@@ -1201,8 +1207,6 @@ retry:
 	     ceph_cap_string(issued));
 	__build_xattrs(inode);
 
-	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
 	if (!ci->i_xattrs.prealloc_blob ||
 	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
 		struct ceph_buffer *blob;
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 523fd0452856..4c3e0648dc27 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -25,6 +25,7 @@ struct ceph_mdsmap {
 	u32 m_session_timeout;          /* seconds */
 	u32 m_session_autoclose;        /* seconds */
 	u64 m_max_file_size;
+	u64 m_max_xattr_size;		/* maximum size for xattrs blob */
 	u32 m_max_mds;			/* expected up:active mds number */
 	u32 m_num_active_mds;		/* actual up:active mds number */
 	u32 possible_max_rank;		/* possible max rank index */
-- 
cgit v1.2.3


From 1b7587d69ea7b8c350195392bfd30a0f31374ae4 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Mon, 6 Jun 2022 20:15:33 +0800
Subject: ceph: fix the incorrect comment for the ceph_mds_caps struct

The incorrect comment is misleading. Acutally the last members
in ceph_mds_caps strcut is a union for none export and export
bodies.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 86bf82dbd8b8..40740e234ce1 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -768,7 +768,7 @@ struct ceph_mds_caps {
 	__le32 xattr_len;
 	__le64 xattr_version;
 
-	/* filelock */
+	/* a union of non-export and export bodies. */
 	__le64 size, max_size, truncate_size;
 	__le32 truncate_seq;
 	struct ceph_timespec mtime, atime, ctime;
-- 
cgit v1.2.3


From 020bc44a9fbf6946f42db503d11c9811f26dd9fd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 10 Jun 2022 11:40:13 -0400
Subject: ceph: switch back to testing for NULL folio->private in
 ceph_dirty_folio

Willy requested that we change this back to warning on folio->private
being non-NULl. He's trying to kill off the PG_private flag, and so we'd
like to catch where it's non-NULL.

Add a VM_WARN_ON_FOLIO (since it doesn't exist yet) and change over to
using that instead of VM_BUG_ON_FOLIO along with testing the ->private
pointer.

[ xiubli: define VM_WARN_ON_FOLIO macro in case DEBUG_VM is disabled
  reported by kernel test robot <lkp@intel.com> ]

Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c          |  2 +-
 include/linux/mmdebug.h | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 01fe75b8d146..31fc04eeb4d0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
 	 * Reference snap context in folio->private.  Also set
 	 * PagePrivate so that we get invalidate_folio callback.
 	 */
-	VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
+	VM_WARN_ON_FOLIO(folio->private, folio);
 	folio_attach_private(folio, snapc);
 
 	return ceph_fscache_dirty_folio(mapping, folio);
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index d7285f8148a3..15ae78cd2853 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -54,6 +54,15 @@ void dump_mm(const struct mm_struct *mm);
 	}								\
 	unlikely(__ret_warn_once);					\
 })
+#define VM_WARN_ON_FOLIO(cond, folio)		({			\
+	int __ret_warn = !!(cond);					\
+									\
+	if (unlikely(__ret_warn)) {					\
+		dump_page(&folio->page, "VM_WARN_ON_FOLIO(" __stringify(cond)")");\
+		WARN_ON(1);						\
+	}								\
+	unlikely(__ret_warn);						\
+})
 #define VM_WARN_ON_ONCE_FOLIO(cond, folio)	({			\
 	static bool __section(".data.once") __warned;			\
 	int __ret_warn_once = !!(cond);					\
@@ -79,6 +88,7 @@ void dump_mm(const struct mm_struct *mm);
 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_PAGE(cond, page)  BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
-- 
cgit v1.2.3


From b53aca4b460ae2ece453963ef01667ee8ee78f52 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 9 Jun 2022 15:21:37 +0800
Subject: ceph: fix incorrect old_size length in ceph_mds_request_args

The 'old_size' is a __le64 type since birth, not sure why the
kclient incorrectly switched it to __le32. This change is okay
won't break anything because union will always allocate more memory
than the 'open' member needed.

Rename 'file_replication' to 'pool' as ceph did. Though this 'open'
struct may never be used in kclient in future, it's confusing when
going through the ceph code.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 40740e234ce1..49586ff26152 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -433,9 +433,9 @@ union ceph_mds_request_args {
 		__le32 stripe_unit;          /* layout for newly created file */
 		__le32 stripe_count;         /* ... */
 		__le32 object_size;
-		__le32 file_replication;
-               __le32 mask;                 /* CEPH_CAP_* */
-               __le32 old_size;
+		__le32 pool;
+		__le32 mask;                 /* CEPH_CAP_* */
+		__le64 old_size;
 	} __attribute__ ((packed)) open;
 	struct {
 		__le32 flags;
-- 
cgit v1.2.3


From 2c61c97fb12b806e1c8eb15f04c277ad097ec95e Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Wed, 8 Jun 2022 11:52:21 -0700
Subject: nvme: handle the persistent internal error AER

In the NVM Express Revision 1.4 spec, Figure 145 describes possible
values for an AER with event type "Error" (value 000b). For a
Persistent Internal Error (value 03h), the host should perform a
controller reset.

Add support for this error using code that already exists for
doing a controller reset. As part of this support, introduce
two utility functions for parsing the AER type and subtype.

This new support was tested in a lab environment where we can
generate the persistent internal error on demand, and observe
both the Linux side and NVMe controller side to see that the
controller reset has been done.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 31 +++++++++++++++++++++++++++++--
 include/linux/nvme.h     |  4 ++++
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2533b88e66d5..92d90dc82a9b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4525,9 +4525,19 @@ static void nvme_fw_act_work(struct work_struct *work)
 	nvme_get_fw_slot_info(ctrl);
 }
 
+static u32 nvme_aer_type(u32 result)
+{
+	return result & 0x7;
+}
+
+static u32 nvme_aer_subtype(u32 result)
+{
+	return (result & 0xff00) >> 8;
+}
+
 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 {
-	u32 aer_notice_type = (result & 0xff00) >> 8;
+	u32 aer_notice_type = nvme_aer_subtype(result);
 
 	trace_nvme_async_event(ctrl, aer_notice_type);
 
@@ -4560,11 +4570,19 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 	}
 }
 
+static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
+{
+	trace_nvme_async_event(ctrl, NVME_AER_ERROR);
+	dev_warn(ctrl->device, "resetting controller due to AER\n");
+	nvme_reset_ctrl(ctrl);
+}
+
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		volatile union nvme_result *res)
 {
 	u32 result = le32_to_cpu(res->u32);
-	u32 aer_type = result & 0x07;
+	u32 aer_type = nvme_aer_type(result);
+	u32 aer_subtype = nvme_aer_subtype(result);
 
 	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
 		return;
@@ -4574,6 +4592,15 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		nvme_handle_aen_notice(ctrl, result);
 		break;
 	case NVME_AER_ERROR:
+		/*
+		 * For a persistent internal error, don't run async_event_work
+		 * to submit a new AER. The controller reset will do it.
+		 */
+		if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
+			nvme_handle_aer_persistent_error(ctrl);
+			return;
+		}
+		fallthrough;
 	case NVME_AER_SMART:
 	case NVME_AER_CSS:
 	case NVME_AER_VS:
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 07cfc922f8e4..3b8fc6c69529 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -711,6 +711,10 @@ enum {
 	NVME_AER_VS			= 7,
 };
 
+enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
 enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
-- 
cgit v1.2.3


From a116e1cdc64a743c36c40e6fa639dec991c157a3 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:51:59 +0200
Subject: lib/base64: RFC4648-compliant base64 encoding

Add RFC4648-compliant base64 encoding and decoding routines, based on
the base64url encoding in fs/crypto/fname.c.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/base64.h |  16 ++++++++
 lib/Makefile           |   2 +-
 lib/base64.c           | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/base64.h
 create mode 100644 lib/base64.c

(limited to 'include/linux')

diff --git a/include/linux/base64.h b/include/linux/base64.h
new file mode 100644
index 000000000000..660d4cb1ef31
--- /dev/null
+++ b/include/linux/base64.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64 encoding, lifted from fs/crypto/fname.c.
+ */
+
+#ifndef _LINUX_BASE64_H
+#define _LINUX_BASE64_H
+
+#include <linux/types.h>
+
+#define BASE64_CHARS(nbytes)   DIV_ROUND_UP((nbytes) * 4, 3)
+
+int base64_encode(const u8 *src, int len, char *dst);
+int base64_decode(const char *src, int len, u8 *dst);
+
+#endif /* _LINUX_BASE64_H */
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..20d2caf6577d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -46,7 +46,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o rhashtable.o \
+	 percpu-refcount.o rhashtable.o base64.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
diff --git a/lib/base64.c b/lib/base64.c
new file mode 100644
index 000000000000..b736a7a431c5
--- /dev/null
+++ b/lib/base64.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * base64.c - RFC4648-compliant base64 encoding
+ *
+ * Copyright (c) 2020 Hannes Reinecke, SUSE
+ *
+ * Based on the base64url routines from fs/crypto/fname.c
+ * (which are using the URL-safe base64 encoding),
+ * modified to use the standard coding table from RFC4648 section 4.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/base64.h>
+
+static const char base64_table[65] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/**
+ * base64_encode() - base64-encode some binary data
+ * @src: the binary data to encode
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the base64-encoded string.  Not NUL-terminated.
+ *
+ * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified
+ * by RFC 4648, including the  '='-padding.
+ *
+ * Return: the length of the resulting base64-encoded string in bytes.
+ */
+int base64_encode(const u8 *src, int srclen, char *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	char *cp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		ac = (ac << 8) | src[i];
+		bits += 8;
+		do {
+			bits -= 6;
+			*cp++ = base64_table[(ac >> bits) & 0x3f];
+		} while (bits >= 6);
+	}
+	if (bits) {
+		*cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
+		bits -= 6;
+	}
+	while (bits < 0) {
+		*cp++ = '=';
+		bits += 2;
+	}
+	return cp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_encode);
+
+/**
+ * base64_decode() - base64-decode a string
+ * @src: the string to decode.  Doesn't need to be NUL-terminated.
+ * @srclen: the length of @src in bytes
+ * @dst: (output) the decoded binary data
+ *
+ * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding"
+ * specified by RFC 4648, including the  '='-padding.
+ *
+ * This implementation hasn't been optimized for performance.
+ *
+ * Return: the length of the resulting decoded binary data in bytes,
+ *	   or -1 if the string isn't a valid base64 string.
+ */
+int base64_decode(const char *src, int srclen, u8 *dst)
+{
+	u32 ac = 0;
+	int bits = 0;
+	int i;
+	u8 *bp = dst;
+
+	for (i = 0; i < srclen; i++) {
+		const char *p = strchr(base64_table, src[i]);
+
+		if (src[i] == '=') {
+			ac = (ac << 6);
+			bits += 6;
+			if (bits >= 8)
+				bits -= 8;
+			continue;
+		}
+		if (p == NULL || src[i] == 0)
+			return -1;
+		ac = (ac << 6) | (p - base64_table);
+		bits += 6;
+		if (bits >= 8) {
+			bits -= 8;
+			*bp++ = (u8)(ac >> bits);
+		}
+	}
+	if (ac & ((1 << bits) - 1))
+		return -1;
+	return bp - dst;
+}
+EXPORT_SYMBOL_GPL(base64_decode);
-- 
cgit v1.2.3


From 88b140fec07307f825170a45562013a80842cc93 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:52:00 +0200
Subject: nvme: add definitions for NVMe In-Band authentication

Add new definitions for NVMe In-band authentication as defined in
the NVMe Base Specification v2.0.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/nvme.h | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 208 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 3b8fc6c69529..ae53d74f3696 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -19,6 +19,7 @@
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
+#define NVMF_AUTH_HASH_LEN	64
 
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
@@ -1373,6 +1374,8 @@ enum nvmf_capsule_command {
 	nvme_fabrics_type_property_set	= 0x00,
 	nvme_fabrics_type_connect	= 0x01,
 	nvme_fabrics_type_property_get	= 0x04,
+	nvme_fabrics_type_auth_send	= 0x05,
+	nvme_fabrics_type_auth_receive	= 0x06,
 };
 
 #define nvme_fabrics_type_name(type)   { type, #type }
@@ -1380,7 +1383,9 @@ enum nvmf_capsule_command {
 	__print_symbolic(type,						\
 		nvme_fabrics_type_name(nvme_fabrics_type_property_set),	\
 		nvme_fabrics_type_name(nvme_fabrics_type_connect),	\
-		nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+		nvme_fabrics_type_name(nvme_fabrics_type_property_get), \
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_send),	\
+		nvme_fabrics_type_name(nvme_fabrics_type_auth_receive))
 
 /*
  * If not fabrics command, fctype will be ignored.
@@ -1476,6 +1481,11 @@ struct nvmf_connect_command {
 	__u8		resv4[12];
 };
 
+enum {
+	NVME_CONNECT_AUTHREQ_ASCR	= (1 << 2),
+	NVME_CONNECT_AUTHREQ_ATR	= (1 << 1),
+};
+
 struct nvmf_connect_data {
 	uuid_t		hostid;
 	__le16		cntlid;
@@ -1510,6 +1520,200 @@ struct nvmf_property_get_command {
 	__u8		resv4[16];
 };
 
+struct nvmf_auth_common_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al_tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_send_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		tl;
+	__u8		resv4[16];
+};
+
+struct nvmf_auth_receive_command {
+	__u8		opcode;
+	__u8		resv1;
+	__u16		command_id;
+	__u8		fctype;
+	__u8		resv2[19];
+	union nvme_data_ptr dptr;
+	__u8		resv3;
+	__u8		spsp0;
+	__u8		spsp1;
+	__u8		secp;
+	__le32		al;
+	__u8		resv4[16];
+};
+
+/* Value for secp */
+enum {
+	NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER	= 0xe9,
+};
+
+/* Defined value for auth_type */
+enum {
+	NVME_AUTH_COMMON_MESSAGES	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGES	= 0x01,
+};
+
+/* Defined messages for auth_id */
+enum {
+	NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE	= 0x00,
+	NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE	= 0x01,
+	NVME_AUTH_DHCHAP_MESSAGE_REPLY		= 0x02,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1	= 0x03,
+	NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2	= 0x04,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE2	= 0xf0,
+	NVME_AUTH_DHCHAP_MESSAGE_FAILURE1	= 0xf1,
+};
+
+struct nvmf_auth_dhchap_protocol_descriptor {
+	__u8		authid;
+	__u8		rsvd;
+	__u8		halen;
+	__u8		dhlen;
+	__u8		idlist[60];
+};
+
+enum {
+	NVME_AUTH_DHCHAP_AUTH_ID	= 0x01,
+};
+
+/* Defined hash functions for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_HASH_SHA256	= 0x01,
+	NVME_AUTH_HASH_SHA384	= 0x02,
+	NVME_AUTH_HASH_SHA512	= 0x03,
+	NVME_AUTH_HASH_INVALID	= 0xff,
+};
+
+/* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */
+enum {
+	NVME_AUTH_DHGROUP_NULL		= 0x00,
+	NVME_AUTH_DHGROUP_2048		= 0x01,
+	NVME_AUTH_DHGROUP_3072		= 0x02,
+	NVME_AUTH_DHGROUP_4096		= 0x03,
+	NVME_AUTH_DHGROUP_6144		= 0x04,
+	NVME_AUTH_DHGROUP_8192		= 0x05,
+	NVME_AUTH_DHGROUP_INVALID	= 0xff,
+};
+
+union nvmf_auth_protocol {
+	struct nvmf_auth_dhchap_protocol_descriptor dhchap;
+};
+
+struct nvmf_auth_dhchap_negotiate_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd;
+	__le16		t_id;
+	__u8		sc_c;
+	__u8		napd;
+	union nvmf_auth_protocol auth_protocol[];
+};
+
+struct nvmf_auth_dhchap_challenge_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__u16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		hashid;
+	__u8		dhgid;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of challenge value */
+	__u8		cval[];
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+struct nvmf_auth_dhchap_reply_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		cvalid;
+	__u8		rsvd3;
+	__le16		dhvlen;
+	__le32		seqnum;
+	/* 'hl' bytes of response data */
+	__u8		rval[];
+	/* followed by 'hl' bytes of Challenge value */
+	/* followed by 'dhvlen' bytes of DH value */
+};
+
+enum {
+	NVME_AUTH_DHCHAP_RESPONSE_VALID	= (1 << 0),
+};
+
+struct nvmf_auth_dhchap_success1_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		hl;
+	__u8		rsvd2;
+	__u8		rvalid;
+	__u8		rsvd3[7];
+	/* 'hl' bytes of response value if 'rvalid' is set */
+	__u8		rval[];
+};
+
+struct nvmf_auth_dhchap_success2_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rsvd2[10];
+};
+
+struct nvmf_auth_dhchap_failure_data {
+	__u8		auth_type;
+	__u8		auth_id;
+	__le16		rsvd1;
+	__le16		t_id;
+	__u8		rescode;
+	__u8		rescode_exp;
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED	= 0x01,
+};
+
+enum {
+	NVME_AUTH_DHCHAP_FAILURE_FAILED			= 0x01,
+	NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE		= 0x02,
+	NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH	= 0x03,
+	NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE		= 0x04,
+	NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE	= 0x05,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD	= 0x06,
+	NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE	= 0x07,
+};
+
+
 struct nvme_dbbuf {
 	__u8			opcode;
 	__u8			flags;
@@ -1553,6 +1757,9 @@ struct nvme_command {
 		struct nvmf_connect_command connect;
 		struct nvmf_property_set_command prop_set;
 		struct nvmf_property_get_command prop_get;
+		struct nvmf_auth_common_command auth_common;
+		struct nvmf_auth_send_command auth_send;
+		struct nvmf_auth_receive_command auth_receive;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
 	};
-- 
cgit v1.2.3


From f50fff73d620cd6e8f48bc58d4f1c944615a3fea Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:52:02 +0200
Subject: nvme: implement In-Band authentication

Implement NVMe-oF In-Band authentication according to NVMe TPAR 8006.
This patch adds two new fabric options 'dhchap_secret' to specify the
pre-shared key (in ASCII respresentation according to NVMe 2.0 section
8.13.5.8 'Secret representation') and 'dhchap_ctrl_secret' to specify
the pre-shared controller key for bi-directional authentication of both
the host and the controller.
Re-authentication can be triggered by writing the PSK into the new
controller sysfs attribute 'dhchap_secret' or 'dhchap_ctrl_secret'.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
[axboe: fold in clang build fix]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/Kconfig         |   1 +
 drivers/nvme/Makefile        |   1 +
 drivers/nvme/common/Kconfig  |   4 +
 drivers/nvme/common/Makefile |   7 +
 drivers/nvme/common/auth.c   | 323 +++++++++++++++++
 drivers/nvme/host/Kconfig    |  13 +
 drivers/nvme/host/Makefile   |   1 +
 drivers/nvme/host/auth.c     | 828 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c     | 143 +++++++-
 drivers/nvme/host/fabrics.c  |  80 ++++-
 drivers/nvme/host/fabrics.h  |   7 +
 drivers/nvme/host/nvme.h     |  30 ++
 drivers/nvme/host/rdma.c     |   1 +
 drivers/nvme/host/tcp.c      |   1 +
 drivers/nvme/host/trace.c    |  32 ++
 include/linux/nvme-auth.h    |  33 ++
 16 files changed, 1498 insertions(+), 7 deletions(-)
 create mode 100644 drivers/nvme/common/Kconfig
 create mode 100644 drivers/nvme/common/Makefile
 create mode 100644 drivers/nvme/common/auth.c
 create mode 100644 drivers/nvme/host/auth.c
 create mode 100644 include/linux/nvme-auth.h

(limited to 'include/linux')

diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index 87ae409a32b9..656e46d938da 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 menu "NVME Support"
 
+source "drivers/nvme/common/Kconfig"
 source "drivers/nvme/host/Kconfig"
 source "drivers/nvme/target/Kconfig"
 
diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index fb42c44609a8..eedca8c72098 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+obj-$(CONFIG_NVME_COMMON)		+= common/
 obj-y		+= host/
 obj-y		+= target/
diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig
new file mode 100644
index 000000000000..4514f44362dd
--- /dev/null
+++ b/drivers/nvme/common/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NVME_COMMON
+       tristate
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile
new file mode 100644
index 000000000000..720c625b8a52
--- /dev/null
+++ b/drivers/nvme/common/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y			+= -I$(src)
+
+obj-$(CONFIG_NVME_COMMON)	+= nvme-common.o
+
+nvme-common-y			+= auth.o
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
new file mode 100644
index 000000000000..adb43d341ffb
--- /dev/null
+++ b/drivers/nvme/common/auth.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include <linux/nvme.h>
+#include <linux/nvme-auth.h>
+
+static u32 nvme_dhchap_seqnum;
+static DEFINE_MUTEX(nvme_dhchap_mutex);
+
+u32 nvme_auth_get_seqnum(void)
+{
+	u32 seqnum;
+
+	mutex_lock(&nvme_dhchap_mutex);
+	if (!nvme_dhchap_seqnum)
+		nvme_dhchap_seqnum = prandom_u32();
+	else {
+		nvme_dhchap_seqnum++;
+		if (!nvme_dhchap_seqnum)
+			nvme_dhchap_seqnum++;
+	}
+	seqnum = nvme_dhchap_seqnum;
+	mutex_unlock(&nvme_dhchap_mutex);
+	return seqnum;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
+
+static struct nvme_auth_dhgroup_map {
+	const char name[16];
+	const char kpp[16];
+} dhgroup_map[] = {
+	[NVME_AUTH_DHGROUP_NULL] = {
+		.name = "null", .kpp = "null" },
+	[NVME_AUTH_DHGROUP_2048] = {
+		.name = "ffdhe2048", .kpp = "ffdhe2048(dh)" },
+	[NVME_AUTH_DHGROUP_3072] = {
+		.name = "ffdhe3072", .kpp = "ffdhe3072(dh)" },
+	[NVME_AUTH_DHGROUP_4096] = {
+		.name = "ffdhe4096", .kpp = "ffdhe4096(dh)" },
+	[NVME_AUTH_DHGROUP_6144] = {
+		.name = "ffdhe6144", .kpp = "ffdhe6144(dh)" },
+	[NVME_AUTH_DHGROUP_8192] = {
+		.name = "ffdhe8192", .kpp = "ffdhe8192(dh)" },
+};
+
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id)
+{
+	if (dhgroup_id > ARRAY_SIZE(dhgroup_map))
+		return NULL;
+	return dhgroup_map[dhgroup_id].name;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name);
+
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id)
+{
+	if (dhgroup_id > ARRAY_SIZE(dhgroup_map))
+		return NULL;
+	return dhgroup_map[dhgroup_id].kpp;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp);
+
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
+{
+	int i;
+
+	if (!dhgroup_name || !strlen(dhgroup_name))
+		return NVME_AUTH_DHGROUP_INVALID;
+	for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) {
+		if (!strlen(dhgroup_map[i].name))
+			continue;
+		if (!strncmp(dhgroup_map[i].name, dhgroup_name,
+			     strlen(dhgroup_map[i].name)))
+			return i;
+	}
+	return NVME_AUTH_DHGROUP_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
+
+static struct nvme_dhchap_hash_map {
+	int len;
+	const char hmac[15];
+	const char digest[8];
+} hash_map[] = {
+	[NVME_AUTH_HASH_SHA256] = {
+		.len = 32,
+		.hmac = "hmac(sha256)",
+		.digest = "sha256",
+	},
+	[NVME_AUTH_HASH_SHA384] = {
+		.len = 48,
+		.hmac = "hmac(sha384)",
+		.digest = "sha384",
+	},
+	[NVME_AUTH_HASH_SHA512] = {
+		.len = 64,
+		.hmac = "hmac(sha512)",
+		.digest = "sha512",
+	},
+};
+
+const char *nvme_auth_hmac_name(u8 hmac_id)
+{
+	if (hmac_id > ARRAY_SIZE(hash_map))
+		return NULL;
+	return hash_map[hmac_id].hmac;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
+
+const char *nvme_auth_digest_name(u8 hmac_id)
+{
+	if (hmac_id > ARRAY_SIZE(hash_map))
+		return NULL;
+	return hash_map[hmac_id].digest;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
+
+u8 nvme_auth_hmac_id(const char *hmac_name)
+{
+	int i;
+
+	if (!hmac_name || !strlen(hmac_name))
+		return NVME_AUTH_HASH_INVALID;
+
+	for (i = 0; i < ARRAY_SIZE(hash_map); i++) {
+		if (!strlen(hash_map[i].hmac))
+			continue;
+		if (!strncmp(hash_map[i].hmac, hmac_name,
+			     strlen(hash_map[i].hmac)))
+			return i;
+	}
+	return NVME_AUTH_HASH_INVALID;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_id);
+
+size_t nvme_auth_hmac_hash_len(u8 hmac_id)
+{
+	if (hmac_id > ARRAY_SIZE(hash_map))
+		return 0;
+	return hash_map[hmac_id].len;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash)
+{
+	struct nvme_dhchap_key *key;
+	unsigned char *p;
+	u32 crc;
+	int ret, key_len;
+	size_t allocated_len = strlen(secret);
+
+	/* Secret might be affixed with a ':' */
+	p = strrchr(secret, ':');
+	if (p)
+		allocated_len = p - secret;
+	key = kzalloc(sizeof(*key), GFP_KERNEL);
+	if (!key)
+		return ERR_PTR(-ENOMEM);
+	key->key = kzalloc(allocated_len, GFP_KERNEL);
+	if (!key->key) {
+		ret = -ENOMEM;
+		goto out_free_key;
+	}
+
+	key_len = base64_decode(secret, allocated_len, key->key);
+	if (key_len < 0) {
+		pr_debug("base64 key decoding error %d\n",
+			 key_len);
+		ret = key_len;
+		goto out_free_secret;
+	}
+
+	if (key_len != 36 && key_len != 52 &&
+	    key_len != 68) {
+		pr_err("Invalid key len %d\n", key_len);
+		ret = -EINVAL;
+		goto out_free_secret;
+	}
+
+	if (key_hash > 0 &&
+	    (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) {
+		pr_err("Mismatched key len %d for %s\n", key_len,
+		       nvme_auth_hmac_name(key_hash));
+		ret = -EINVAL;
+		goto out_free_secret;
+	}
+
+	/* The last four bytes is the CRC in little-endian format */
+	key_len -= 4;
+	/*
+	 * The linux implementation doesn't do pre- and post-increments,
+	 * so we have to do it manually.
+	 */
+	crc = ~crc32(~0, key->key, key_len);
+
+	if (get_unaligned_le32(key->key + key_len) != crc) {
+		pr_err("key crc mismatch (key %08x, crc %08x)\n",
+		       get_unaligned_le32(key->key + key_len), crc);
+		ret = -EKEYREJECTED;
+		goto out_free_secret;
+	}
+	key->len = key_len;
+	key->hash = key_hash;
+	return key;
+out_free_secret:
+	kfree_sensitive(key->key);
+out_free_key:
+	kfree(key);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_extract_key);
+
+void nvme_auth_free_key(struct nvme_dhchap_key *key)
+{
+	if (!key)
+		return;
+	kfree_sensitive(key->key);
+	kfree(key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free_key);
+
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn)
+{
+	const char *hmac_name;
+	struct crypto_shash *key_tfm;
+	struct shash_desc *shash;
+	u8 *transformed_key;
+	int ret;
+
+	if (!key || !key->key) {
+		pr_warn("No key specified\n");
+		return ERR_PTR(-ENOKEY);
+	}
+	if (key->hash == 0) {
+		transformed_key = kmemdup(key->key, key->len, GFP_KERNEL);
+		return transformed_key ? transformed_key : ERR_PTR(-ENOMEM);
+	}
+	hmac_name = nvme_auth_hmac_name(key->hash);
+	if (!hmac_name) {
+		pr_warn("Invalid key hash id %d\n", key->hash);
+		return ERR_PTR(-EINVAL);
+	}
+
+	key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(key_tfm))
+		return (u8 *)key_tfm;
+
+	shash = kmalloc(sizeof(struct shash_desc) +
+			crypto_shash_descsize(key_tfm),
+			GFP_KERNEL);
+	if (!shash) {
+		ret = -ENOMEM;
+		goto out_free_key;
+	}
+
+	transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL);
+	if (!transformed_key) {
+		ret = -ENOMEM;
+		goto out_free_shash;
+	}
+
+	shash->tfm = key_tfm;
+	ret = crypto_shash_setkey(key_tfm, key->key, key->len);
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_init(shash);
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_update(shash, nqn, strlen(nqn));
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
+	if (ret < 0)
+		goto out_free_shash;
+	ret = crypto_shash_final(shash, transformed_key);
+out_free_shash:
+	kfree(shash);
+out_free_key:
+	crypto_free_shash(key_tfm);
+	if (ret < 0) {
+		kfree_sensitive(transformed_key);
+		return ERR_PTR(ret);
+	}
+	return transformed_key;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
+
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
+{
+	struct nvme_dhchap_key *key;
+	u8 key_hash;
+
+	if (!secret) {
+		*ret_key = NULL;
+		return 0;
+	}
+
+	if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1)
+		return -EINVAL;
+
+	/* Pass in the secret without the 'DHHC-1:XX:' prefix */
+	key = nvme_auth_extract_key(secret + 10, key_hash);
+	if (IS_ERR(key)) {
+		*ret_key = NULL;
+		return PTR_ERR(key);
+	}
+
+	*ret_key = key;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 877d2ec4ea9f..6c503f42f3c6 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -92,6 +92,19 @@ config NVME_TCP
 
 	  If unsure, say N.
 
+config NVME_AUTH
+	bool "NVM Express over Fabrics In-Band Authentication"
+	depends on NVME_CORE
+	select NVME_COMMON
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
+	select CRYPTO_SHA512
+	help
+	  This provides support for NVMe over Fabrics In-Band Authentication.
+
+	  If unsure, say N.
+
 config NVME_APPLE
 	tristate "Apple ANS2 NVM Express host driver"
 	depends on OF && BLOCK
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a36ae1612059..a3e88f32f560 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -16,6 +16,7 @@ nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
+nvme-core-$(CONFIG_NVME_AUTH)		+= auth.o
 
 nvme-y					+= pci.o
 
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
new file mode 100644
index 000000000000..9766bfffecac
--- /dev/null
+++ b/drivers/nvme/host/auth.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Hannes Reinecke, SUSE Linux
+ */
+
+#include <linux/crc32.h>
+#include <linux/base64.h>
+#include <linux/prandom.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <crypto/dh.h>
+#include "nvme.h"
+#include "fabrics.h"
+#include <linux/nvme-auth.h>
+
+struct nvme_dhchap_queue_context {
+	struct list_head entry;
+	struct work_struct auth_work;
+	struct nvme_ctrl *ctrl;
+	struct crypto_shash *shash_tfm;
+	void *buf;
+	size_t buf_size;
+	int qid;
+	int error;
+	u32 s1;
+	u32 s2;
+	u16 transaction;
+	u8 status;
+	u8 hash_id;
+	size_t hash_len;
+	u8 dhgroup_id;
+	u8 c1[64];
+	u8 c2[64];
+	u8 response[64];
+	u8 *host_response;
+};
+
+#define nvme_auth_flags_from_qid(qid) \
+	(qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED
+#define nvme_auth_queue_from_qid(ctrl, qid) \
+	(qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
+
+static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
+			    void *data, size_t data_len, bool auth_send)
+{
+	struct nvme_command cmd = {};
+	blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid);
+	struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid);
+	int ret;
+
+	cmd.auth_common.opcode = nvme_fabrics_command;
+	cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER;
+	cmd.auth_common.spsp0 = 0x01;
+	cmd.auth_common.spsp1 = 0x01;
+	if (auth_send) {
+		cmd.auth_send.fctype = nvme_fabrics_type_auth_send;
+		cmd.auth_send.tl = cpu_to_le32(data_len);
+	} else {
+		cmd.auth_receive.fctype = nvme_fabrics_type_auth_receive;
+		cmd.auth_receive.al = cpu_to_le32(data_len);
+	}
+
+	ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len,
+				     qid == 0 ? NVME_QID_ANY : qid,
+				     0, flags);
+	if (ret > 0)
+		dev_warn(ctrl->device,
+			"qid %d auth_send failed with status %d\n", qid, ret);
+	else if (ret < 0)
+		dev_err(ctrl->device,
+			"qid %d auth_send failed with error %d\n", qid, ret);
+	return ret;
+}
+
+static int nvme_auth_receive_validate(struct nvme_ctrl *ctrl, int qid,
+		struct nvmf_auth_dhchap_failure_data *data,
+		u16 transaction, u8 expected_msg)
+{
+	dev_dbg(ctrl->device, "%s: qid %d auth_type %d auth_id %x\n",
+		__func__, qid, data->auth_type, data->auth_id);
+
+	if (data->auth_type == NVME_AUTH_COMMON_MESSAGES &&
+	    data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
+		return data->rescode_exp;
+	}
+	if (data->auth_type != NVME_AUTH_DHCHAP_MESSAGES ||
+	    data->auth_id != expected_msg) {
+		dev_warn(ctrl->device,
+			 "qid %d invalid message %02x/%02x\n",
+			 qid, data->auth_type, data->auth_id);
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	}
+	if (le16_to_cpu(data->t_id) != transaction) {
+		dev_warn(ctrl->device,
+			 "qid %d invalid transaction ID %d\n",
+			 qid, le16_to_cpu(data->t_id));
+		return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE;
+	}
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
+	size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return -EINVAL;
+	}
+	memset((u8 *)chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->sc_c = 0; /* No secure channel concatenation */
+	data->napd = 1;
+	data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
+	data->auth_protocol[0].dhchap.halen = 3;
+	data->auth_protocol[0].dhchap.dhlen = 6;
+	data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256;
+	data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384;
+	data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512;
+	data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL;
+	data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048;
+	data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072;
+	data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096;
+	data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
+	data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
+
+	return size;
+}
+
+static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_challenge_data *data = chap->buf;
+	u16 dhvlen = le16_to_cpu(data->dhvlen);
+	size_t size = sizeof(*data) + data->hl + dhvlen;
+	const char *hmac_name, *kpp_name;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	hmac_name = nvme_auth_hmac_name(data->hashid);
+	if (!hmac_name) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid HASH ID %d\n",
+			 chap->qid, data->hashid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (chap->hash_id == data->hashid && chap->shash_tfm &&
+	    !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) &&
+	    crypto_shash_digestsize(chap->shash_tfm) == data->hl) {
+		dev_dbg(ctrl->device,
+			"qid %d: reuse existing hash %s\n",
+			chap->qid, hmac_name);
+		goto select_kpp;
+	}
+
+	/* Reset if hash cannot be reused */
+	if (chap->shash_tfm) {
+		crypto_free_shash(chap->shash_tfm);
+		chap->hash_id = 0;
+		chap->hash_len = 0;
+	}
+	chap->shash_tfm = crypto_alloc_shash(hmac_name, 0,
+					     CRYPTO_ALG_ALLOCATES_MEMORY);
+	if (IS_ERR(chap->shash_tfm)) {
+		dev_warn(ctrl->device,
+			 "qid %d: failed to allocate hash %s, error %ld\n",
+			 chap->qid, hmac_name, PTR_ERR(chap->shash_tfm));
+		chap->shash_tfm = NULL;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %d\n",
+			 chap->qid, data->hl);
+		crypto_free_shash(chap->shash_tfm);
+		chap->shash_tfm = NULL;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Reset host response if the hash had been changed */
+	if (chap->hash_id != data->hashid) {
+		kfree(chap->host_response);
+		chap->host_response = NULL;
+	}
+
+	chap->hash_id = data->hashid;
+	chap->hash_len = data->hl;
+	dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
+		chap->qid, hmac_name);
+
+select_kpp:
+	kpp_name = nvme_auth_dhgroup_kpp(data->dhgid);
+	if (!kpp_name) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid DH group id %d\n",
+			 chap->qid, data->dhgid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	if (data->dhgid != NVME_AUTH_DHGROUP_NULL) {
+		dev_warn(ctrl->device,
+			 "qid %d: unsupported DH group %s\n",
+			 chap->qid, kpp_name);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		return NVME_SC_AUTH_REQUIRED;
+	} else if (dhvlen != 0) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid DH value for NULL DH\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+	chap->dhgroup_id = data->dhgid;
+
+	chap->s1 = le32_to_cpu(data->seqnum);
+	memcpy(chap->c1, data->cval, chap->hash_len);
+
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_reply_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	size += 2 * chap->hash_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return -EINVAL;
+	}
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->hl = chap->hash_len;
+	data->dhvlen = 0;
+	memcpy(data->rval, chap->response, chap->hash_len);
+	if (ctrl->ctrl_key) {
+		get_random_bytes(chap->c2, chap->hash_len);
+		data->cvalid = 1;
+		chap->s2 = nvme_auth_get_seqnum();
+		memcpy(data->rval + chap->hash_len, chap->c2,
+		       chap->hash_len);
+		dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, chap->c2);
+	} else {
+		memset(chap->c2, 0, chap->hash_len);
+		chap->s2 = 0;
+	}
+	data->seqnum = cpu_to_le32(chap->s2);
+	return size;
+}
+
+static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success1_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	if (ctrl->ctrl_key)
+		size += chap->hash_len;
+
+	if (chap->buf_size < size) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	if (data->hl != chap->hash_len) {
+		dev_warn(ctrl->device,
+			 "qid %d: invalid hash length %u\n",
+			 chap->qid, data->hl);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
+		return NVME_SC_INVALID_FIELD;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: authenticated with hash %s dhgroup %s\n",
+			 nvme_auth_hmac_name(chap->hash_id),
+			 nvme_auth_dhgroup_name(chap->dhgroup_id));
+
+	if (!data->rvalid)
+		return 0;
+
+	/* Validate controller response */
+	if (memcmp(chap->response, data->rval, data->hl)) {
+		dev_dbg(ctrl->device, "%s: qid %d ctrl response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len, data->rval);
+		dev_dbg(ctrl->device, "%s: qid %d host response %*ph\n",
+			__func__, chap->qid, (int)chap->hash_len,
+			chap->response);
+		dev_warn(ctrl->device,
+			 "qid %d: controller authentication failed\n",
+			 chap->qid);
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return NVME_SC_AUTH_REQUIRED;
+	}
+
+	/* Just print out information for the admin queue */
+	if (chap->qid == 0)
+		dev_info(ctrl->device,
+			 "qid 0: controller authenticated\n");
+	return 0;
+}
+
+static int nvme_auth_set_dhchap_success2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_success2_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_DHCHAP_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2;
+	data->t_id = cpu_to_le16(chap->transaction);
+
+	return size;
+}
+
+static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	struct nvmf_auth_dhchap_failure_data *data = chap->buf;
+	size_t size = sizeof(*data);
+
+	memset(chap->buf, 0, size);
+	data->auth_type = NVME_AUTH_COMMON_MESSAGES;
+	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2;
+	data->t_id = cpu_to_le16(chap->transaction);
+	data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED;
+	data->rescode_exp = chap->status;
+
+	return size;
+}
+
+static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 buf[4], *challenge = chap->c1;
+	int ret;
+
+	dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s1, chap->transaction);
+
+	if (!chap->host_response) {
+		chap->host_response = nvme_auth_transform_key(ctrl->host_key,
+						ctrl->opts->host->nqn);
+		if (IS_ERR(chap->host_response)) {
+			ret = PTR_ERR(chap->host_response);
+			chap->host_response = NULL;
+			return ret;
+		}
+	} else {
+		dev_dbg(ctrl->device, "%s: qid %d re-using host response\n",
+			__func__, chap->qid);
+	}
+
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			chap->host_response, ctrl->host_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s1, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, sizeof(buf));
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "HostHost", 8);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+			    strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c1)
+		kfree(challenge);
+	return ret;
+}
+
+static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
+	u8 *ctrl_response;
+	u8 buf[4], *challenge = chap->c2;
+	int ret;
+
+	ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key,
+				ctrl->opts->subsysnqn);
+	if (IS_ERR(ctrl_response)) {
+		ret = PTR_ERR(ctrl_response);
+		return ret;
+	}
+	ret = crypto_shash_setkey(chap->shash_tfm,
+			ctrl_response, ctrl->ctrl_key->len);
+	if (ret) {
+		dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
+			 chap->qid, ret);
+		goto out;
+	}
+
+	dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
+		__func__, chap->qid, chap->s2, chap->transaction);
+	dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
+		__func__, chap->qid, (int)chap->hash_len, challenge);
+	dev_dbg(ctrl->device, "%s: qid %d subsysnqn %s\n",
+		__func__, chap->qid, ctrl->opts->subsysnqn);
+	dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
+		__func__, chap->qid, ctrl->opts->host->nqn);
+	shash->tfm = chap->shash_tfm;
+	ret = crypto_shash_init(shash);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, challenge, chap->hash_len);
+	if (ret)
+		goto out;
+	put_unaligned_le32(chap->s2, buf);
+	ret = crypto_shash_update(shash, buf, 4);
+	if (ret)
+		goto out;
+	put_unaligned_le16(chap->transaction, buf);
+	ret = crypto_shash_update(shash, buf, 2);
+	if (ret)
+		goto out;
+	memset(buf, 0, 4);
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, "Controller", 10);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
+				  strlen(ctrl->opts->subsysnqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, buf, 1);
+	if (ret)
+		goto out;
+	ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
+				  strlen(ctrl->opts->host->nqn));
+	if (ret)
+		goto out;
+	ret = crypto_shash_final(shash, chap->response);
+out:
+	if (challenge != chap->c2)
+		kfree(challenge);
+	kfree(ctrl_response);
+	return ret;
+}
+
+static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
+{
+	chap->status = 0;
+	chap->error = 0;
+	chap->s1 = 0;
+	chap->s2 = 0;
+	chap->transaction = 0;
+	memset(chap->c1, 0, sizeof(chap->c1));
+	memset(chap->c2, 0, sizeof(chap->c2));
+}
+
+static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
+{
+	__nvme_auth_reset(chap);
+	if (chap->shash_tfm)
+		crypto_free_shash(chap->shash_tfm);
+	kfree_sensitive(chap->host_response);
+	kfree(chap->buf);
+	kfree(chap);
+}
+
+static void __nvme_auth_work(struct work_struct *work)
+{
+	struct nvme_dhchap_queue_context *chap =
+		container_of(work, struct nvme_dhchap_queue_context, auth_work);
+	struct nvme_ctrl *ctrl = chap->ctrl;
+	size_t tl;
+	int ret = 0;
+
+	chap->transaction = ctrl->transaction++;
+
+	/* DH-HMAC-CHAP Step 1: send negotiate */
+	dev_dbg(ctrl->device, "%s: qid %d send negotiate\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_negotiate_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		return;
+	}
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		return;
+	}
+
+	/* DH-HMAC-CHAP Step 2: receive challenge */
+	dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive challenge, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid, chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	ret = nvme_auth_process_dhchap_challenge(ctrl, chap);
+	if (ret) {
+		/* Invalid challenge parameters */
+		chap->error = ret;
+		goto fail2;
+	}
+
+	dev_dbg(ctrl->device, "%s: qid %d host response\n",
+		__func__, chap->qid);
+	ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 3: send reply */
+	dev_dbg(ctrl->device, "%s: qid %d send reply\n",
+		__func__, chap->qid);
+	ret = nvme_auth_set_dhchap_reply_data(ctrl, chap);
+	if (ret < 0) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	tl = ret;
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	if (ret) {
+		chap->error = ret;
+		goto fail2;
+	}
+
+	/* DH-HMAC-CHAP Step 4: receive success1 */
+	dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
+		__func__, chap->qid);
+
+	memset(chap->buf, 0, chap->buf_size);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid %d failed to receive success1, %s %d\n",
+			 chap->qid, ret < 0 ? "error" : "nvme status", ret);
+		chap->error = ret;
+		return;
+	}
+	ret = nvme_auth_receive_validate(ctrl, chap->qid,
+					 chap->buf, chap->transaction,
+					 NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1);
+	if (ret) {
+		chap->status = ret;
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		return;
+	}
+
+	if (ctrl->ctrl_key) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d controller response\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
+	ret = nvme_auth_process_dhchap_success1(ctrl, chap);
+	if (ret) {
+		/* Controller authentication failed */
+		chap->error = NVME_SC_AUTH_REQUIRED;
+		goto fail2;
+	}
+
+	if (ctrl->ctrl_key) {
+		/* DH-HMAC-CHAP Step 5: send success2 */
+		dev_dbg(ctrl->device, "%s: qid %d send success2\n",
+			__func__, chap->qid);
+		tl = nvme_auth_set_dhchap_success2_data(ctrl, chap);
+		ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+		if (ret)
+			chap->error = ret;
+	}
+	if (!ret) {
+		chap->error = 0;
+		return;
+	}
+
+fail2:
+	dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n",
+		__func__, chap->qid, chap->status);
+	tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap);
+	ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true);
+	/*
+	 * only update error if send failure2 failed and no other
+	 * error had been set during authentication.
+	 */
+	if (ret && !chap->error)
+		chap->error = ret;
+}
+
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	if (!ctrl->host_key) {
+		dev_warn(ctrl->device, "qid %d: no key\n", qid);
+		return -ENOKEY;
+	}
+
+	if (ctrl->opts->dhchap_ctrl_secret && !ctrl->ctrl_key) {
+		dev_warn(ctrl->device, "qid %d: invalid ctrl key\n", qid);
+		return -ENOKEY;
+	}
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	/* Check if the context is already queued */
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		WARN_ON(!chap->buf);
+		if (chap->qid == qid) {
+			dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
+			mutex_unlock(&ctrl->dhchap_auth_mutex);
+			flush_work(&chap->auth_work);
+			__nvme_auth_reset(chap);
+			queue_work(nvme_wq, &chap->auth_work);
+			return 0;
+		}
+	}
+	chap = kzalloc(sizeof(*chap), GFP_KERNEL);
+	if (!chap) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		return -ENOMEM;
+	}
+	chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
+	chap->ctrl = ctrl;
+
+	/*
+	 * Allocate a large enough buffer for the entire negotiation:
+	 * 4k should be enough to ffdhe8192.
+	 */
+	chap->buf_size = 4096;
+	chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
+	if (!chap->buf) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		kfree(chap);
+		return -ENOMEM;
+	}
+
+	INIT_WORK(&chap->auth_work, __nvme_auth_work);
+	list_add(&chap->entry, &ctrl->dhchap_auth_list);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	queue_work(nvme_wq, &chap->auth_work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_negotiate);
+
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	struct nvme_dhchap_queue_context *chap;
+	int ret;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		if (chap->qid != qid)
+			continue;
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		ret = chap->error;
+		return ret;
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_wait);
+
+void nvme_auth_reset(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
+		mutex_unlock(&ctrl->dhchap_auth_mutex);
+		flush_work(&chap->auth_work);
+		__nvme_auth_reset(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_reset);
+
+static void nvme_dhchap_auth_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, dhchap_auth_work);
+	int ret, q;
+
+	/* Authenticate admin queue first */
+	ret = nvme_auth_negotiate(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: error %d setting up authentication\n", ret);
+		return;
+	}
+	ret = nvme_auth_wait(ctrl, 0);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "qid 0: authentication failed\n");
+		return;
+	}
+
+	for (q = 1; q < ctrl->queue_count; q++) {
+		ret = nvme_auth_negotiate(ctrl, q);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: error %d setting up authentication\n",
+				 q, ret);
+			break;
+		}
+	}
+
+	/*
+	 * Failure is a soft-state; credentials remain valid until
+	 * the controller terminates the connection.
+	 */
+}
+
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+{
+	INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
+	INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
+	mutex_init(&ctrl->dhchap_auth_mutex);
+	if (!ctrl->opts)
+		return;
+	nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
+	nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
+
+void nvme_auth_stop(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	cancel_work_sync(&ctrl->dhchap_auth_work);
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
+		cancel_work_sync(&chap->auth_work);
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_auth_stop);
+
+void nvme_auth_free(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+
+	mutex_lock(&ctrl->dhchap_auth_mutex);
+	list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
+		list_del_init(&chap->entry);
+		flush_work(&chap->auth_work);
+		__nvme_auth_free(chap);
+	}
+	mutex_unlock(&ctrl->dhchap_auth_mutex);
+	if (ctrl->host_key) {
+		nvme_auth_free_key(ctrl->host_key);
+		ctrl->host_key = NULL;
+	}
+	if (ctrl->ctrl_key) {
+		nvme_auth_free_key(ctrl->ctrl_key);
+		ctrl->ctrl_key = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_auth_free);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0fe0ed6a28ea..4aa869e37b4c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -24,6 +24,7 @@
 
 #include "nvme.h"
 #include "fabrics.h"
+#include <linux/nvme-auth.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -330,6 +331,7 @@ enum nvme_disposition {
 	COMPLETE,
 	RETRY,
 	FAILOVER,
+	AUTHENTICATE,
 };
 
 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
@@ -337,6 +339,9 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	if (likely(nvme_req(req)->status == 0))
 		return COMPLETE;
 
+	if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED)
+		return AUTHENTICATE;
+
 	if (blk_noretry_request(req) ||
 	    (nvme_req(req)->status & NVME_SC_DNR) ||
 	    nvme_req(req)->retries >= nvme_max_retries)
@@ -375,11 +380,13 @@ static inline void nvme_end_req(struct request *req)
 
 void nvme_complete_rq(struct request *req)
 {
+	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
+
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);
 
-	if (nvme_req(req)->ctrl->kas)
-		nvme_req(req)->ctrl->comp_seen = true;
+	if (ctrl->kas)
+		ctrl->comp_seen = true;
 
 	switch (nvme_decide_disposition(req)) {
 	case COMPLETE:
@@ -391,6 +398,14 @@ void nvme_complete_rq(struct request *req)
 	case FAILOVER:
 		nvme_failover_req(req);
 		return;
+	case AUTHENTICATE:
+#ifdef CONFIG_NVME_AUTH
+		queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+		nvme_retry_req(req);
+#else
+		nvme_end_req(req);
+#endif
+		return;
 	}
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -702,7 +717,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 		switch (ctrl->state) {
 		case NVME_CTRL_CONNECTING:
 			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-			    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+			    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
+			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
+			     req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
 				return true;
 			break;
 		default:
@@ -3609,6 +3626,108 @@ static ssize_t dctype_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(dctype);
 
+#ifdef CONFIG_NVME_AUTH
+static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_secret);
+		opts->dhchap_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store);
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (!opts->dhchap_ctrl_secret)
+		return sysfs_emit(buf, "none\n");
+	return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret);
+}
+
+static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	char *dhchap_secret;
+
+	if (!ctrl->opts->dhchap_ctrl_secret)
+		return -EINVAL;
+	if (count < 7)
+		return -EINVAL;
+	if (memcmp(buf, "DHHC-1:", 7))
+		return -EINVAL;
+
+	dhchap_secret = kzalloc(count + 1, GFP_KERNEL);
+	if (!dhchap_secret)
+		return -ENOMEM;
+	memcpy(dhchap_secret, buf, count);
+	nvme_auth_stop(ctrl);
+	if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
+		int ret;
+
+		ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
+		if (ret)
+			return ret;
+		kfree(opts->dhchap_ctrl_secret);
+		opts->dhchap_ctrl_secret = dhchap_secret;
+		/* Key has changed; re-authentication with new key */
+		nvme_auth_reset(ctrl);
+	}
+	/* Start re-authentication */
+	dev_info(ctrl->device, "re-authenticating controller\n");
+	queue_work(nvme_wq, &ctrl->dhchap_auth_work);
+
+	return count;
+}
+static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
+	nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
+#endif
+
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
@@ -3632,6 +3751,10 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_kato.attr,
 	&dev_attr_cntrltype.attr,
 	&dev_attr_dctype.attr,
+#ifdef CONFIG_NVME_AUTH
+	&dev_attr_dhchap_secret.attr,
+	&dev_attr_dhchap_ctrl_secret.attr,
+#endif
 	NULL
 };
 
@@ -3655,6 +3778,12 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
 		return 0;
+#ifdef CONFIG_NVME_AUTH
+	if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
+		return 0;
+#endif
 
 	return a->mode;
 }
@@ -4548,8 +4677,10 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 		 * recovery actions from interfering with the controller's
 		 * firmware activation.
 		 */
-		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+			nvme_auth_stop(ctrl);
 			queue_work(nvme_wq, &ctrl->fw_act_work);
+		}
 		break;
 #ifdef CONFIG_NVME_MULTIPATH
 	case NVME_AER_NOTICE_ANA:
@@ -4613,6 +4744,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
@@ -4672,6 +4804,8 @@ static void nvme_free_ctrl(struct device *dev)
 
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	nvme_auth_stop(ctrl);
+	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
 
 	if (subsys) {
@@ -4762,6 +4896,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
 	nvme_mpath_init_ctrl(ctrl);
+	nvme_auth_init_ctrl(ctrl);
 
 	return 0;
 out_free_name:
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index e4b1520862d8..5207a2348257 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -369,6 +369,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	union nvme_result res;
 	struct nvmf_connect_data *data;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -401,8 +402,25 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 		goto out_free_data;
 	}
 
-	ctrl->cntlid = le16_to_cpu(res.u16);
-
+	result = le32_to_cpu(res.u32);
+	ctrl->cntlid = result & 0xFFFF;
+	if ((result >> 16) & 0x3) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, 0);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid 0: authentication setup failed\n");
+			ret = NVME_SC_AUTH_REQUIRED;
+			goto out_free_data;
+		}
+		ret = nvme_auth_wait(ctrl, 0);
+		if (ret)
+			dev_warn(ctrl->device,
+				 "qid 0: authentication failed\n");
+		else
+			dev_info(ctrl->device,
+				 "qid 0: authenticated\n");
+	}
 out_free_data:
 	kfree(data);
 	return ret;
@@ -435,6 +453,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 	struct nvmf_connect_data *data;
 	union nvme_result res;
 	int ret;
+	u32 result;
 
 	cmd.connect.opcode = nvme_fabrics_command;
 	cmd.connect.fctype = nvme_fabrics_type_connect;
@@ -460,6 +479,21 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				       &cmd, data);
 	}
+	result = le32_to_cpu(res.u32);
+	if ((result >> 16) & 2) {
+		/* Authentication required */
+		ret = nvme_auth_negotiate(ctrl, qid);
+		if (ret) {
+			dev_warn(ctrl->device,
+				 "qid %d: authentication setup failed\n", qid);
+			ret = NVME_SC_AUTH_REQUIRED;
+		} else {
+			ret = nvme_auth_wait(ctrl, qid);
+			if (ret)
+				dev_warn(ctrl->device,
+					 "qid %u: authentication failed\n", qid);
+		}
+	}
 	kfree(data);
 	return ret;
 }
@@ -552,6 +586,8 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TOS,			"tos=%d"		},
 	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
 	{ NVMF_OPT_DISCOVERY,		"discovery"		},
+	{ NVMF_OPT_DHCHAP_SECRET,	"dhchap_secret=%s"	},
+	{ NVMF_OPT_DHCHAP_CTRL_SECRET,	"dhchap_ctrl_secret=%s"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -833,6 +869,34 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DISCOVERY:
 			opts->discovery_nqn = true;
 			break;
+		case NVMF_OPT_DHCHAP_SECRET:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+				pr_err("Invalid DH-CHAP secret %s\n", p);
+				ret = -EINVAL;
+				goto out;
+			}
+			kfree(opts->dhchap_secret);
+			opts->dhchap_secret = p;
+			break;
+		case NVMF_OPT_DHCHAP_CTRL_SECRET:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) {
+				pr_err("Invalid DH-CHAP secret %s\n", p);
+				ret = -EINVAL;
+				goto out;
+			}
+			kfree(opts->dhchap_ctrl_secret);
+			opts->dhchap_ctrl_secret = p;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
@@ -951,6 +1015,8 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
 	kfree(opts->subsysnqn);
 	kfree(opts->host_traddr);
 	kfree(opts->host_iface);
+	kfree(opts->dhchap_secret);
+	kfree(opts->dhchap_ctrl_secret);
 	kfree(opts);
 }
 EXPORT_SYMBOL_GPL(nvmf_free_options);
@@ -960,7 +1026,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
 				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
-				 NVMF_OPT_FAIL_FAST_TMO)
+				 NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
+				 NVMF_OPT_DHCHAP_CTRL_SECRET)
 
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
@@ -1196,7 +1263,14 @@ static void __exit nvmf_exit(void)
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16);
+	BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16);
 }
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 46d6e194ac2b..a6e22116e139 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -68,6 +68,8 @@ enum {
 	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 	NVMF_OPT_HOST_IFACE	= 1 << 21,
 	NVMF_OPT_DISCOVERY	= 1 << 22,
+	NVMF_OPT_DHCHAP_SECRET	= 1 << 23,
+	NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24,
 };
 
 /**
@@ -97,6 +99,9 @@ enum {
  * @max_reconnects: maximum number of allowed reconnect attempts before removing
  *              the controller, (-1) means reconnect forever, zero means remove
  *              immediately;
+ * @dhchap_secret: DH-HMAC-CHAP secret
+ * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional
+ *              authentication
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
@@ -121,6 +126,8 @@ struct nvmf_ctrl_options {
 	unsigned int		kato;
 	struct nvmf_host	*host;
 	int			max_reconnects;
+	char			*dhchap_secret;
+	char			*dhchap_ctrl_secret;
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0085a3053e90..85929d70b776 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -328,6 +328,15 @@ struct nvme_ctrl {
 	struct work_struct ana_work;
 #endif
 
+#ifdef CONFIG_NVME_AUTH
+	struct work_struct dhchap_auth_work;
+	struct list_head dhchap_auth_list;
+	struct mutex dhchap_auth_mutex;
+	struct nvme_dhchap_key *host_key;
+	struct nvme_dhchap_key *ctrl_key;
+	u16 transaction;
+#endif
+
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
 	bool apst_enabled;
@@ -992,6 +1001,27 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
 	return ctrl->sgls & ((1 << 0) | (1 << 1));
 }
 
+#ifdef CONFIG_NVME_AUTH
+void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_auth_stop(struct nvme_ctrl *ctrl);
+int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
+int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
+void nvme_auth_reset(struct nvme_ctrl *ctrl);
+void nvme_auth_free(struct nvme_ctrl *ctrl);
+#else
+static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
+static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
+static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
+{
+	return -EPROTONOSUPPORT;
+}
+static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
+{
+	return NVME_SC_AUTH_REQUIRED;
+}
+static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
+#endif
+
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
 int nvme_execute_passthru_rq(struct request *rq);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4665aebd944d..7ecdeb58bd27 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1205,6 +1205,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	struct nvme_rdma_ctrl *ctrl = container_of(work,
 			struct nvme_rdma_ctrl, err_work);
 
+	nvme_auth_stop(&ctrl->ctrl);
 	nvme_stop_keep_alive(&ctrl->ctrl);
 	flush_work(&ctrl->ctrl.async_event_work);
 	nvme_rdma_teardown_io_queues(ctrl, false);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index b95ee85053e3..592ec078c5d7 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2173,6 +2173,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
+	nvme_auth_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
 	nvme_tcp_teardown_io_queues(ctrl, false);
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 2a89c5aa0790..1c36fcedea20 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -287,6 +287,34 @@ static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
 	return ret;
 }
 
+static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 tl = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u",
+			 spsp0, spsp1, secp, tl);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 spsp0 = spc[1];
+	u8 spsp1 = spc[2];
+	u8 secp = spc[3];
+	u32 al = get_unaligned_le32(spc + 4);
+
+	trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u",
+			 spsp0, spsp1, secp, al);
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
 static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -306,6 +334,10 @@ const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
 		return nvme_trace_fabrics_connect(p, spc);
 	case nvme_fabrics_type_property_get:
 		return nvme_trace_fabrics_property_get(p, spc);
+	case nvme_fabrics_type_auth_send:
+		return nvme_trace_fabrics_auth_send(p, spc);
+	case nvme_fabrics_type_auth_receive:
+		return nvme_trace_fabrics_auth_receive(p, spc);
 	default:
 		return nvme_trace_fabrics_common(p, spc);
 	}
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
new file mode 100644
index 000000000000..354456826221
--- /dev/null
+++ b/include/linux/nvme-auth.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions
+ */
+
+#ifndef _NVME_AUTH_H
+#define _NVME_AUTH_H
+
+#include <crypto/kpp.h>
+
+struct nvme_dhchap_key {
+	u8 *key;
+	size_t len;
+	u8 hash;
+};
+
+u32 nvme_auth_get_seqnum(void);
+const char *nvme_auth_dhgroup_name(u8 dhgroup_id);
+const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id);
+u8 nvme_auth_dhgroup_id(const char *dhgroup_name);
+
+const char *nvme_auth_hmac_name(u8 hmac_id);
+const char *nvme_auth_digest_name(u8 hmac_id);
+size_t nvme_auth_hmac_hash_len(u8 hmac_id);
+u8 nvme_auth_hmac_id(const char *hmac_name);
+
+struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
+					      u8 key_hash);
+void nvme_auth_free_key(struct nvme_dhchap_key *key);
+u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
+int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+
+#endif /* _NVME_AUTH_H */
-- 
cgit v1.2.3


From b61775d185a395f26fecdc7898e39de677a6c3dd Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 27 Jun 2022 11:52:03 +0200
Subject: nvme-auth: Diffie-Hellman key exchange support

Implement Diffie-Hellman key exchange using FFDHE groups
for NVMe In-Band Authentication.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/common/auth.c | 153 ++++++++++++++++++++++++++++++++++
 drivers/nvme/host/Kconfig  |   2 +
 drivers/nvme/host/auth.c   | 201 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/nvme-auth.h  |   8 ++
 4 files changed, 358 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index adb43d341ffb..94cc7cafbb9d 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -295,6 +295,159 @@ out_free_key:
 }
 EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
 
+static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
+{
+	const char *digest_name;
+	struct crypto_shash *tfm;
+	int ret;
+
+	digest_name = nvme_auth_digest_name(hmac_id);
+	if (!digest_name) {
+		pr_debug("%s: failed to get digest for %d\n", __func__,
+			 hmac_id);
+		return -EINVAL;
+	}
+	tfm = crypto_alloc_shash(digest_name, 0, 0);
+	if (IS_ERR(tfm))
+		return -ENOMEM;
+
+	ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
+	if (ret < 0)
+		pr_debug("%s: Failed to hash digest len %zu\n", __func__,
+			 skey_len);
+
+	crypto_free_shash(tfm);
+	return ret;
+}
+
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+		u8 *challenge, u8 *aug, size_t hlen)
+{
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	u8 *hashed_key;
+	const char *hmac_name;
+	int ret;
+
+	hashed_key = kmalloc(hlen, GFP_KERNEL);
+	if (!hashed_key)
+		return -ENOMEM;
+
+	ret = nvme_auth_hash_skey(hmac_id, skey,
+				  skey_len, hashed_key);
+	if (ret < 0)
+		goto out_free_key;
+
+	hmac_name = nvme_auth_hmac_name(hmac_id);
+	if (!hmac_name) {
+		pr_warn("%s: invalid hash algoritm %d\n",
+			__func__, hmac_id);
+		ret = -EINVAL;
+		goto out_free_key;
+	}
+
+	tfm = crypto_alloc_shash(hmac_name, 0, 0);
+	if (IS_ERR(tfm)) {
+		ret = PTR_ERR(tfm);
+		goto out_free_key;
+	}
+
+	desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+		       GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out_free_hash;
+	}
+	desc->tfm = tfm;
+
+	ret = crypto_shash_setkey(tfm, hashed_key, hlen);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_init(desc);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_update(desc, challenge, hlen);
+	if (ret)
+		goto out_free_desc;
+
+	ret = crypto_shash_final(desc, aug);
+out_free_desc:
+	kfree_sensitive(desc);
+out_free_hash:
+	crypto_free_shash(tfm);
+out_free_key:
+	kfree_sensitive(hashed_key);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
+
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid)
+{
+	int ret;
+
+	ret = crypto_kpp_set_secret(dh_tfm, NULL, 0);
+	if (ret)
+		pr_debug("failed to set private key, error %d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey);
+
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+		u8 *host_key, size_t host_key_len)
+{
+	struct kpp_request *req;
+	struct crypto_wait wait;
+	struct scatterlist dst;
+	int ret;
+
+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	crypto_init_wait(&wait);
+	kpp_request_set_input(req, NULL, 0);
+	sg_init_one(&dst, host_key, host_key_len);
+	kpp_request_set_output(req, &dst, host_key_len);
+	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				 crypto_req_done, &wait);
+
+	ret = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait);
+	kpp_request_free(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
+
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+		u8 *ctrl_key, size_t ctrl_key_len,
+		u8 *sess_key, size_t sess_key_len)
+{
+	struct kpp_request *req;
+	struct crypto_wait wait;
+	struct scatterlist src, dst;
+	int ret;
+
+	req = kpp_request_alloc(dh_tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	crypto_init_wait(&wait);
+	sg_init_one(&src, ctrl_key, ctrl_key_len);
+	kpp_request_set_input(req, &src, ctrl_key_len);
+	sg_init_one(&dst, sess_key, sess_key_len);
+	kpp_request_set_output(req, &dst, sess_key_len);
+	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				 crypto_req_done, &wait);
+
+	ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait);
+
+	kpp_request_free(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
+
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
 {
 	struct nvme_dhchap_key *key;
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 6c503f42f3c6..2f6a7f8c94e8 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -100,6 +100,8 @@ config NVME_AUTH
 	select CRYPTO_HMAC
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
+	select CRYPTO_DH
+	select CRYPTO_DH_RFC7919_GROUPS
 	help
 	  This provides support for NVMe over Fabrics In-Band Authentication.
 
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 9766bfffecac..c8a6db7c4498 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -18,6 +18,7 @@ struct nvme_dhchap_queue_context {
 	struct work_struct auth_work;
 	struct nvme_ctrl *ctrl;
 	struct crypto_shash *shash_tfm;
+	struct crypto_kpp *dh_tfm;
 	void *buf;
 	size_t buf_size;
 	int qid;
@@ -33,6 +34,12 @@ struct nvme_dhchap_queue_context {
 	u8 c2[64];
 	u8 response[64];
 	u8 *host_response;
+	u8 *ctrl_key;
+	int ctrl_key_len;
+	u8 *host_key;
+	int host_key_len;
+	u8 *sess_key;
+	int sess_key_len;
 };
 
 #define nvme_auth_flags_from_qid(qid) \
@@ -137,6 +144,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
 	struct nvmf_auth_dhchap_challenge_data *data = chap->buf;
 	u16 dhvlen = le16_to_cpu(data->dhvlen);
 	size_t size = sizeof(*data) + data->hl + dhvlen;
+	const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
 	const char *hmac_name, *kpp_name;
 
 	if (chap->buf_size < size) {
@@ -207,15 +215,54 @@ select_kpp:
 			 "qid %d: invalid DH group id %d\n",
 			 chap->qid, data->dhgid);
 		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+		/* Leave previous dh_tfm intact */
 		return NVME_SC_AUTH_REQUIRED;
 	}
 
+	/* Clear host and controller key to avoid accidental reuse */
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+
+	if (chap->dhgroup_id == data->dhgid &&
+	    (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
+		dev_dbg(ctrl->device,
+			"qid %d: reuse existing DH group %s\n",
+			chap->qid, gid_name);
+		goto skip_kpp;
+	}
+
+	/* Reset dh_tfm if it can't be reused */
+	if (chap->dh_tfm) {
+		crypto_free_kpp(chap->dh_tfm);
+		chap->dh_tfm = NULL;
+	}
+
 	if (data->dhgid != NVME_AUTH_DHGROUP_NULL) {
-		dev_warn(ctrl->device,
-			 "qid %d: unsupported DH group %s\n",
-			 chap->qid, kpp_name);
-		chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
-		return NVME_SC_AUTH_REQUIRED;
+		if (dhvlen == 0) {
+			dev_warn(ctrl->device,
+				 "qid %d: empty DH value\n",
+				 chap->qid);
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+			return NVME_SC_INVALID_FIELD;
+		}
+
+		chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0);
+		if (IS_ERR(chap->dh_tfm)) {
+			int ret = PTR_ERR(chap->dh_tfm);
+
+			dev_warn(ctrl->device,
+				 "qid %d: error %d initializing DH group %s\n",
+				 chap->qid, ret, gid_name);
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE;
+			chap->dh_tfm = NULL;
+			return NVME_SC_AUTH_REQUIRED;
+		}
+		dev_dbg(ctrl->device, "qid %d: selected DH group %s\n",
+			chap->qid, gid_name);
 	} else if (dhvlen != 0) {
 		dev_warn(ctrl->device,
 			 "qid %d: invalid DH value for NULL DH\n",
@@ -225,8 +272,21 @@ select_kpp:
 	}
 	chap->dhgroup_id = data->dhgid;
 
+skip_kpp:
 	chap->s1 = le32_to_cpu(data->seqnum);
 	memcpy(chap->c1, data->cval, chap->hash_len);
+	if (dhvlen) {
+		chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL);
+		if (!chap->ctrl_key) {
+			chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+			return NVME_SC_AUTH_REQUIRED;
+		}
+		chap->ctrl_key_len = dhvlen;
+		memcpy(chap->ctrl_key, data->cval + chap->hash_len,
+		       dhvlen);
+		dev_dbg(ctrl->device, "ctrl public key %*ph\n",
+			 (int)chap->ctrl_key_len, chap->ctrl_key);
+	}
 
 	return 0;
 }
@@ -239,6 +299,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 
 	size += 2 * chap->hash_len;
 
+	if (chap->host_key_len)
+		size += chap->host_key_len;
+
 	if (chap->buf_size < size) {
 		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
 		return -EINVAL;
@@ -249,7 +312,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 	data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY;
 	data->t_id = cpu_to_le16(chap->transaction);
 	data->hl = chap->hash_len;
-	data->dhvlen = 0;
+	data->dhvlen = cpu_to_le16(chap->host_key_len);
 	memcpy(data->rval, chap->response, chap->hash_len);
 	if (ctrl->ctrl_key) {
 		get_random_bytes(chap->c2, chap->hash_len);
@@ -264,6 +327,14 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
 		chap->s2 = 0;
 	}
 	data->seqnum = cpu_to_le32(chap->s2);
+	if (chap->host_key_len) {
+		dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n",
+			__func__, chap->qid,
+			chap->host_key_len, chap->host_key);
+		memcpy(data->rval + 2 * chap->hash_len, chap->host_key,
+		       chap->host_key_len);
+	}
+
 	return size;
 }
 
@@ -381,6 +452,21 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
 		goto out;
 	}
 
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c1, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
+
 	shash->tfm = chap->shash_tfm;
 	ret = crypto_shash_init(shash);
 	if (ret)
@@ -443,6 +529,20 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
 		goto out;
 	}
 
+	if (chap->dh_tfm) {
+		challenge = kmalloc(chap->hash_len, GFP_KERNEL);
+		if (!challenge) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = nvme_auth_augmented_challenge(chap->hash_id,
+						    chap->sess_key,
+						    chap->sess_key_len,
+						    chap->c2, challenge,
+						    chap->hash_len);
+		if (ret)
+			goto out;
+	}
 	dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n",
 		__func__, chap->qid, chap->s2, chap->transaction);
 	dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n",
@@ -492,8 +592,81 @@ out:
 	return ret;
 }
 
+static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
+		struct nvme_dhchap_queue_context *chap)
+{
+	int ret;
+
+	if (chap->host_key && chap->host_key_len) {
+		dev_dbg(ctrl->device,
+			"qid %d: reusing host key\n", chap->qid);
+		goto gen_sesskey;
+	}
+	ret = nvme_auth_gen_privkey(chap->dh_tfm, chap->dhgroup_id);
+	if (ret < 0) {
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+	chap->host_key_len = crypto_kpp_maxsize(chap->dh_tfm);
+
+	chap->host_key = kzalloc(chap->host_key_len, GFP_KERNEL);
+	if (!chap->host_key) {
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+	ret = nvme_auth_gen_pubkey(chap->dh_tfm,
+				   chap->host_key, chap->host_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate public key, error %d\n", ret);
+		kfree(chap->host_key);
+		chap->host_key = NULL;
+		chap->host_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+
+gen_sesskey:
+	chap->sess_key_len = chap->host_key_len;
+	chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL);
+	if (!chap->sess_key) {
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
+		return -ENOMEM;
+	}
+
+	ret = nvme_auth_gen_shared_secret(chap->dh_tfm,
+					  chap->ctrl_key, chap->ctrl_key_len,
+					  chap->sess_key, chap->sess_key_len);
+	if (ret) {
+		dev_dbg(ctrl->device,
+			"failed to generate shared secret, error %d\n", ret);
+		kfree_sensitive(chap->sess_key);
+		chap->sess_key = NULL;
+		chap->sess_key_len = 0;
+		chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+		return ret;
+	}
+	dev_dbg(ctrl->device, "shared secret %*ph\n",
+		(int)chap->sess_key_len, chap->sess_key);
+	return 0;
+}
+
 static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
 {
+	kfree_sensitive(chap->host_response);
+	chap->host_response = NULL;
+	kfree_sensitive(chap->host_key);
+	chap->host_key = NULL;
+	chap->host_key_len = 0;
+	kfree_sensitive(chap->ctrl_key);
+	chap->ctrl_key = NULL;
+	chap->ctrl_key_len = 0;
+	kfree_sensitive(chap->sess_key);
+	chap->sess_key = NULL;
+	chap->sess_key_len = 0;
 	chap->status = 0;
 	chap->error = 0;
 	chap->s1 = 0;
@@ -508,6 +681,11 @@ static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
 	__nvme_auth_reset(chap);
 	if (chap->shash_tfm)
 		crypto_free_shash(chap->shash_tfm);
+	if (chap->dh_tfm)
+		crypto_free_kpp(chap->dh_tfm);
+	kfree_sensitive(chap->ctrl_key);
+	kfree_sensitive(chap->host_key);
+	kfree_sensitive(chap->sess_key);
 	kfree_sensitive(chap->host_response);
 	kfree(chap->buf);
 	kfree(chap);
@@ -566,6 +744,17 @@ static void __nvme_auth_work(struct work_struct *work)
 		goto fail2;
 	}
 
+	if (chap->ctrl_key_len) {
+		dev_dbg(ctrl->device,
+			"%s: qid %d DH exponential\n",
+			__func__, chap->qid);
+		ret = nvme_auth_dhchap_exponential(ctrl, chap);
+		if (ret) {
+			chap->error = ret;
+			goto fail2;
+		}
+	}
+
 	dev_dbg(ctrl->device, "%s: qid %d host response\n",
 		__func__, chap->qid);
 	ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h
index 354456826221..dcb8030062dd 100644
--- a/include/linux/nvme-auth.h
+++ b/include/linux/nvme-auth.h
@@ -29,5 +29,13 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
 void nvme_auth_free_key(struct nvme_dhchap_key *key);
 u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn);
 int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key);
+int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
+				  u8 *challenge, u8 *aug, size_t hlen);
+int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid);
+int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
+			 u8 *host_key, size_t host_key_len);
+int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
+				u8 *ctrl_key, size_t ctrl_key_len,
+				u8 *sess_key, size_t sess_key_len);
 
 #endif /* _NVME_AUTH_H */
-- 
cgit v1.2.3


From 5a97806f7dc069d9561d9930a2ae108700e222ab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 27 Jul 2022 12:22:55 -0400
Subject: block: change the blk_queue_split calling convention

The double indirect bio leads to somewhat suboptimal code generation.
Instead return the (original or split) bio, and make sure the
request_queue arguments to the lower level helpers is passed after the
bio to avoid constant reshuffling of the argument passing registers.

Also give it and the helpers used to implement it more descriptive names.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220727162300.3089193-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c             | 98 +++++++++++++++++++++----------------------
 block/blk-mq.c                |  4 +-
 block/blk.h                   |  6 +--
 drivers/block/drbd/drbd_req.c |  2 +-
 drivers/block/pktcdvd.c       |  2 +-
 drivers/block/ps3vram.c       |  2 +-
 drivers/md/dm.c               |  6 +--
 drivers/md/md.c               |  2 +-
 drivers/nvme/host/multipath.c |  2 +-
 drivers/s390/block/dcssblk.c  |  2 +-
 include/linux/blkdev.h        |  2 +-
 11 files changed, 63 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4c8a699754c9..6e29fb28584e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -95,10 +95,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
 	return bio_will_gap(req->q, NULL, bio, req->bio);
 }
 
-static struct bio *blk_bio_discard_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *nsegs)
+static struct bio *bio_split_discard(struct bio *bio, struct request_queue *q,
+		unsigned *nsegs, struct bio_set *bs)
 {
 	unsigned int max_discard_sectors, granularity;
 	int alignment;
@@ -139,8 +137,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
-		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+		struct request_queue *q, unsigned *nsegs, struct bio_set *bs)
 {
 	*nsegs = 0;
 
@@ -161,8 +159,8 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
  * requests that are submitted to a block device if the start of a bio is not
  * aligned to a physical block boundary.
  */
-static inline unsigned get_max_io_size(struct request_queue *q,
-				       struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+		struct request_queue *q)
 {
 	unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
 	unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
@@ -247,16 +245,16 @@ static bool bvec_split_segs(const struct request_queue *q,
 }
 
 /**
- * blk_bio_segment_split - split a bio in two bios
- * @q:    [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
  * @bio:  [in] bio to be split
- * @bs:	  [in] bio set to allocate the clone from
+ * @q:    [in] request queue pointer
  * @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs:	  [in] bio set to allocate the clone from
  *
  * Clone @bio, update the bi_iter of the clone to represent the first sectors
  * of @bio and update @bio->bi_iter to represent the remaining sectors. The
  * following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most get_max_io_size(@bio, @q) sectors.
  * - That it has at most queue_max_segments(@q) segments.
  *
  * Except for discard requests the cloned bio will point at the bi_io_vec of
@@ -265,15 +263,13 @@ static bool bvec_split_segs(const struct request_queue *q,
  * responsible for ensuring that @bs is only destroyed after processing of the
  * split bio has finished.
  */
-static struct bio *blk_bio_segment_split(struct request_queue *q,
-					 struct bio *bio,
-					 struct bio_set *bs,
-					 unsigned *segs)
+static struct bio *bio_split_rw(struct bio *bio, struct request_queue *q,
+		unsigned *segs, struct bio_set *bs)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
-	const unsigned max_bytes = get_max_io_size(q, bio) << 9;
+	const unsigned max_bytes = get_max_io_size(bio, q) << 9;
 	const unsigned max_segs = queue_max_segments(q);
 
 	bio_for_each_bvec(bv, bio, iter) {
@@ -320,34 +316,33 @@ split:
 }
 
 /**
- * __blk_queue_split - split a bio and submit the second half
- * @q:       [in] request_queue new bio is being queued at
- * @bio:     [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
+ * @q:       request_queue new bio is being queued at
+ * @nr_segs: returns the number of segments in the returned bio
+ *
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it.  @bio is
+ * shortened to the remainder and re-submitted.
  *
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from q->bio_split, it is the responsibility
- * of the caller to ensure that q->bio_split is only released after processing
- * of the split bio has finished.
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
+struct bio *__bio_split_to_limits(struct bio *bio, struct request_queue *q,
 		       unsigned int *nr_segs)
 {
-	struct bio *split = NULL;
+	struct bio *split;
 
-	switch (bio_op(*bio)) {
+	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_discard(bio, q, nr_segs, &q->bio_split);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
-				nr_segs);
+		split = bio_split_write_zeroes(bio, q, nr_segs, &q->bio_split);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+		split = bio_split_rw(bio, q, nr_segs, &q->bio_split);
 		break;
 	}
 
@@ -356,32 +351,35 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
 		split->bi_opf |= REQ_NOMERGE;
 
 		blkcg_bio_issue_init(split);
-		bio_chain(split, *bio);
-		trace_block_split(split, (*bio)->bi_iter.bi_sector);
-		submit_bio_noacct(*bio);
-		*bio = split;
+		bio_chain(split, bio);
+		trace_block_split(split, bio->bi_iter.bi_sector);
+		submit_bio_noacct(bio);
+		return split;
 	}
+	return bio;
 }
 
 /**
- * blk_queue_split - split a bio and submit the second half
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     bio to be split
+ *
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it.  @bio is shortened to the remainder and re-submitted.
  *
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from q->bio_split, it is the responsibility of the caller to ensure
- * that q->bio_split is only released after processing of the split bio has
- * finished.
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
  */
-void blk_queue_split(struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
 {
-	struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	unsigned int nr_segs;
 
-	if (blk_may_split(q, *bio))
-		__blk_queue_split(q, bio, &nr_segs);
+	if (bio_may_exceed_limits(bio, q))
+		return __bio_split_to_limits(bio, q, &nr_segs);
+	return bio;
 }
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
 
 unsigned int blk_recalc_rq_segments(struct request *rq)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 70177ee74295..790f55453f1b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2816,8 +2816,8 @@ void blk_mq_submit_bio(struct bio *bio)
 	blk_status_t ret;
 
 	blk_queue_bounce(q, &bio);
-	if (blk_may_split(q, bio))
-		__blk_queue_split(q, &bio, &nr_segs);
+	if (bio_may_exceed_limits(bio, q))
+		bio = __bio_split_to_limits(bio, q, &nr_segs);
 
 	if (!bio_integrity_prep(bio))
 		return;
diff --git a/block/blk.h b/block/blk.h
index 1d83b1d41cd1..623be4c2e60c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -288,7 +288,7 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
 				const char *, size_t);
 
-static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
+static inline bool bio_may_exceed_limits(struct bio *bio, struct request_queue *q)
 {
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
@@ -311,8 +311,8 @@ static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
 		bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
 }
 
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
-			unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio, struct request_queue *q,
+		       unsigned int *nr_segs);
 int ll_back_merge_fn(struct request *req, struct bio *bio,
 		unsigned int nr_segs);
 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 6d8dd14458c6..8f7f144e54f3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1608,7 +1608,7 @@ void drbd_submit_bio(struct bio *bio)
 {
 	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	/*
 	 * what we "blindly" assume:
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 01a15dbd9cde..4cea3b08087e 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2399,7 +2399,7 @@ static void pkt_submit_bio(struct bio *bio)
 	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
 	struct bio *split;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
 		(unsigned long long)bio->bi_iter.bi_sector,
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index d1e0fefec90b..e1d080f680ed 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -586,7 +586,7 @@ static void ps3vram_submit_bio(struct bio *bio)
 
 	dev_dbg(&dev->core, "%s\n", __func__);
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	spin_lock_irq(&priv->lock);
 	busy = !bio_list_empty(&priv->list);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 99642f69bfa7..fbcffcfb2304 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1181,7 +1181,7 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector)
 	 * Does the target need to split IO even further?
 	 * - varied (per target) IO splitting is a tenet of DM; this
 	 *   explains why stacked chunk_sectors based splitting via
-	 *   blk_queue_split() isn't possible here.
+	 *   bio_split_to_limits() isn't possible here.
 	 */
 	if (!ti->max_io_len)
 		return len;
@@ -1751,10 +1751,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	is_abnormal = is_abnormal_io(bio);
 	if (unlikely(is_abnormal)) {
 		/*
-		 * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
 		 * otherwise associated queue_limits won't be imposed.
 		 */
-		blk_queue_split(&bio);
+		bio = bio_split_to_limits(bio);
 	}
 
 	init_clone_info(&ci, md, map, bio, is_abnormal);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 35b895813c88..afaf36b2f6ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -442,7 +442,7 @@ static void md_submit_bio(struct bio *bio)
 		return;
 	}
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 		if (bio_sectors(bio) != 0)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 64f44d98daaf..6ef497c75a16 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -346,7 +346,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 	 * different queue via blk_steal_bios(), so we need to use the bio_split
 	 * pool from the original queue to allocate the bvecs from.
 	 */
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = nvme_find_path(head);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4d8d1759775a..5187705bd0f3 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -863,7 +863,7 @@ dcssblk_submit_bio(struct bio *bio)
 	unsigned long source_addr;
 	unsigned long bytes_done;
 
-	blk_queue_split(&bio);
+	bio = bio_split_to_limits(bio);
 
 	bytes_done = 0;
 	dev_info = bio->bi_bdev->bd_disk->private_data;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d04bdf549efa..5eef8d2eddc1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -864,9 +864,9 @@ void blk_request_module(dev_t devt);
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 void submit_bio_noacct(struct bio *bio);
+struct bio *bio_split_to_limits(struct bio *bio);
 
 extern int blk_lld_busy(struct request_queue *q);
-extern void blk_queue_split(struct bio **);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-- 
cgit v1.2.3


From 46754bd056053785d7079f1d48f7f571728dcb47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 27 Jul 2022 12:22:57 -0400
Subject: block: move ->bio_split to the gendisk

Only non-passthrough requests are split by the block layer and use the
->bio_split bio_set.  Move it from the request_queue to the gendisk.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20220727162300.3089193-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 9 +--------
 block/blk-merge.c      | 7 ++++---
 block/blk-sysfs.c      | 2 --
 block/genhd.c          | 8 +++++++-
 drivers/md/dm.c        | 2 +-
 include/linux/blkdev.h | 3 ++-
 6 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3d286a256d3d..a0d1104c5590 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -377,7 +377,6 @@ static void blk_timeout_work(struct work_struct *work)
 struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
-	int ret;
 
 	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
 			GFP_KERNEL | __GFP_ZERO, node_id);
@@ -396,13 +395,9 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 	if (q->id < 0)
 		goto fail_srcu;
 
-	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
-	if (ret)
-		goto fail_id;
-
 	q->stats = blk_alloc_queue_stats();
 	if (!q->stats)
-		goto fail_split;
+		goto fail_id;
 
 	q->node = node_id;
 
@@ -439,8 +434,6 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 
 fail_stats:
 	blk_free_queue_stats(q->stats);
-fail_split:
-	bioset_exit(&q->bio_split);
 fail_id:
 	ida_free(&blk_queue_ida, q->id);
 fail_srcu:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6e29fb28584e..30872a353764 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -331,18 +331,19 @@ split:
 struct bio *__bio_split_to_limits(struct bio *bio, struct request_queue *q,
 		       unsigned int *nr_segs)
 {
+	struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
 	struct bio *split;
 
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = bio_split_discard(bio, q, nr_segs, &q->bio_split);
+		split = bio_split_discard(bio, q, nr_segs, bs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = bio_split_write_zeroes(bio, q, nr_segs, &q->bio_split);
+		split = bio_split_write_zeroes(bio, q, nr_segs, bs);
 		break;
 	default:
-		split = bio_split_rw(bio, q, nr_segs, &q->bio_split);
+		split = bio_split_rw(bio, q, nr_segs, bs);
 		break;
 	}
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c0303026752d..e1f009aba6fd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -779,8 +779,6 @@ static void blk_release_queue(struct kobject *kobj)
 	if (queue_is_mq(q))
 		blk_mq_release(q);
 
-	bioset_exit(&q->bio_split);
-
 	if (blk_queue_has_srcu(q))
 		cleanup_srcu_struct(q->srcu);
 
diff --git a/block/genhd.c b/block/genhd.c
index e1d5b10ac193..b901fea1d55a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1151,6 +1151,7 @@ static void disk_release(struct device *dev)
 		blk_mq_exit_queue(disk->queue);
 
 	blkcg_exit_queue(disk->queue);
+	bioset_exit(&disk->bio_split);
 
 	disk_release_events(disk);
 	kfree(disk->random);
@@ -1342,9 +1343,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (!disk)
 		goto out_put_queue;
 
+	if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+		goto out_free_disk;
+
 	disk->bdi = bdi_alloc(node_id);
 	if (!disk->bdi)
-		goto out_free_disk;
+		goto out_free_bioset;
 
 	/* bdev_alloc() might need the queue, set before the first call */
 	disk->queue = q;
@@ -1382,6 +1386,8 @@ out_destroy_part_tbl:
 	iput(disk->part0->bd_inode);
 out_free_bdi:
 	bdi_put(disk->bdi);
+out_free_bioset:
+	bioset_exit(&disk->bio_split);
 out_free_disk:
 	kfree(disk);
 out_put_queue:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fbcffcfb2304..28bd4a35b86b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,7 +1016,7 @@ static void dm_wq_requeue_work(struct work_struct *work)
 	while (io) {
 		struct dm_io *next = io->next;
 
-		dm_io_rewind(io, &md->queue->bio_split);
+		dm_io_rewind(io, &md->disk->bio_split);
 
 		io->next = NULL;
 		__dm_io_complete(io, false);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5eef8d2eddc1..49dcd31e283e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -140,6 +140,8 @@ struct gendisk {
 	struct request_queue *queue;
 	void *private_data;
 
+	struct bio_set bio_split;
+
 	int flags;
 	unsigned long state;
 #define GD_NEED_PART_SCAN		0
@@ -531,7 +533,6 @@ struct request_queue {
 
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
-	struct bio_set		bio_split;
 
 	struct dentry		*debugfs_dir;
 	struct dentry		*sched_debugfs_dir;
-- 
cgit v1.2.3


From cb3b3bf22cf33707d684e74207908ba0ef3b6467 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Jun 2022 13:23:48 +0200
Subject: jbd2: rename jbd_debug() to jbd2_debug()

The name of jbd_debug() is confusing as all functions inside jbd2 have
jbd2_ prefix. Rename jbd_debug() to jbd2_debug(). No functional changes.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Link: https://lore.kernel.org/r/20220608112355.4397-2-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c  |  6 +++---
 fs/jbd2/commit.c      | 30 +++++++++++++++---------------
 fs/jbd2/journal.c     | 34 +++++++++++++++++-----------------
 fs/jbd2/recovery.c    | 30 +++++++++++++++---------------
 fs/jbd2/revoke.c      |  8 ++++----
 fs/jbd2/transaction.c | 26 +++++++++++++-------------
 include/linux/jbd2.h  |  4 ++--
 7 files changed, 69 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 746132998c57..51bd38da21cd 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	tid_t			this_tid;
 	int			result, batch_count = 0;
 
-	jbd_debug(1, "Start checkpoint\n");
+	jbd2_debug(1, "Start checkpoint\n");
 
 	/*
 	 * First thing: if there are any transactions in the log which
@@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
 	trace_jbd2_checkpoint(journal, result);
-	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+	jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
 
@@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 
 	trace_jbd2_drop_transaction(journal, transaction);
 
-	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+	jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index eb315e81f1a6..aa14f20241d7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -421,7 +421,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
-		jbd_debug(3, "super block updated\n");
+		jbd2_debug(3, "super block updated\n");
 		mutex_lock_io(&journal->j_checkpoint_mutex);
 		/*
 		 * We hold j_checkpoint_mutex so tail cannot change under us.
@@ -435,7 +435,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 						REQ_SYNC);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
-		jbd_debug(3, "superblock not updated\n");
+		jbd2_debug(3, "superblock not updated\n");
 	}
 
 	J_ASSERT(journal->j_running_transaction != NULL);
@@ -467,7 +467,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 
 	trace_jbd2_start_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
 	write_lock(&journal->j_state_lock);
@@ -540,7 +540,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	__jbd2_journal_clean_checkpoint_list(journal, false);
 	spin_unlock(&journal->j_list_lock);
 
-	jbd_debug(3, "JBD2: commit phase 1\n");
+	jbd2_debug(3, "JBD2: commit phase 1\n");
 
 	/*
 	 * Clear revoked flag to reflect there is no revoked buffers
@@ -573,7 +573,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	wake_up(&journal->j_wait_transaction_locked);
 	write_unlock(&journal->j_state_lock);
 
-	jbd_debug(3, "JBD2: commit phase 2a\n");
+	jbd2_debug(3, "JBD2: commit phase 2a\n");
 
 	/*
 	 * Now start flushing things to disk, in the order they appear
@@ -586,7 +586,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
 
-	jbd_debug(3, "JBD2: commit phase 2b\n");
+	jbd2_debug(3, "JBD2: commit phase 2b\n");
 
 	/*
 	 * Way to go: we have now written out all of the data for a
@@ -642,7 +642,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (!descriptor) {
 			J_ASSERT (bufs == 0);
 
-			jbd_debug(4, "JBD2: get descriptor\n");
+			jbd2_debug(4, "JBD2: get descriptor\n");
 
 			descriptor = jbd2_journal_get_descriptor_buffer(
 							commit_transaction,
@@ -652,7 +652,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				continue;
 			}
 
-			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
 				(unsigned long long)descriptor->b_blocknr,
 				descriptor->b_data);
 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
@@ -737,7 +737,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		    commit_transaction->t_buffers == NULL ||
 		    space_left < tag_bytes + 16 + csum_size) {
 
-			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
 
 			/* Write an end-of-descriptor marker before
                            submitting the IOs.  "tag" still points to
@@ -839,7 +839,7 @@ start_journal_io:
 	   so we incur less scheduling load.
 	*/
 
-	jbd_debug(3, "JBD2: commit phase 3\n");
+	jbd2_debug(3, "JBD2: commit phase 3\n");
 
 	while (!list_empty(&io_bufs)) {
 		struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -882,7 +882,7 @@ start_journal_io:
 
 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
 
-	jbd_debug(3, "JBD2: commit phase 4\n");
+	jbd2_debug(3, "JBD2: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
 	while (!list_empty(&log_bufs)) {
@@ -906,7 +906,7 @@ start_journal_io:
 	if (err)
 		jbd2_journal_abort(journal, err);
 
-	jbd_debug(3, "JBD2: commit phase 5\n");
+	jbd2_debug(3, "JBD2: commit phase 5\n");
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 	commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -945,7 +945,7 @@ start_journal_io:
            transaction can be removed from any checkpoint list it was on
            before. */
 
-	jbd_debug(3, "JBD2: commit phase 6\n");
+	jbd2_debug(3, "JBD2: commit phase 6\n");
 
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -1122,7 +1122,7 @@ restart_loop:
 
 	/* Done with this transaction! */
 
-	jbd_debug(3, "JBD2: commit phase 7\n");
+	jbd2_debug(3, "JBD2: commit phase 7\n");
 
 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
@@ -1164,7 +1164,7 @@ restart_loop:
 		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
 
 	trace_jbd2_end_commit(journal, commit_transaction);
-	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
 	write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0cbeeaec2d1..0a8ff211fac1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -203,11 +203,11 @@ loop:
 	if (journal->j_flags & JBD2_UNMOUNT)
 		goto end_loop;
 
-	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
+	jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
 		journal->j_commit_sequence, journal->j_commit_request);
 
 	if (journal->j_commit_sequence != journal->j_commit_request) {
-		jbd_debug(1, "OK, requests differ\n");
+		jbd2_debug(1, "OK, requests differ\n");
 		write_unlock(&journal->j_state_lock);
 		del_timer_sync(&journal->j_commit_timer);
 		jbd2_journal_commit_transaction(journal);
@@ -222,7 +222,7 @@ loop:
 		 * good idea, because that depends on threads that may
 		 * be already stopped.
 		 */
-		jbd_debug(1, "Now suspending kjournald2\n");
+		jbd2_debug(1, "Now suspending kjournald2\n");
 		write_unlock(&journal->j_state_lock);
 		try_to_freeze();
 		write_lock(&journal->j_state_lock);
@@ -252,7 +252,7 @@ loop:
 		finish_wait(&journal->j_wait_commit, &wait);
 	}
 
-	jbd_debug(1, "kjournald2 wakes\n");
+	jbd2_debug(1, "kjournald2 wakes\n");
 
 	/*
 	 * Were we woken up by a commit wakeup event?
@@ -260,7 +260,7 @@ loop:
 	transaction = journal->j_running_transaction;
 	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
 		journal->j_commit_request = transaction->t_tid;
-		jbd_debug(1, "woke because of timeout\n");
+		jbd2_debug(1, "woke because of timeout\n");
 	}
 	goto loop;
 
@@ -268,7 +268,7 @@ end_loop:
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
-	jbd_debug(1, "Journal thread exiting.\n");
+	jbd2_debug(1, "Journal thread exiting.\n");
 	write_unlock(&journal->j_state_lock);
 	return 0;
 }
@@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 		 */
 
 		journal->j_commit_request = target;
-		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
+		jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
 			  journal->j_commit_request,
 			  journal->j_commit_sequence);
 		journal->j_running_transaction->t_requested = jiffies;
@@ -705,7 +705,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	}
 #endif
 	while (tid_gt(tid, journal->j_commit_sequence)) {
-		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
+		jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
 				  tid, journal->j_commit_sequence);
 		read_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
@@ -1117,7 +1117,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 		freed += journal->j_last - journal->j_first;
 
 	trace_jbd2_update_log_tail(journal, tid, block, freed);
-	jbd_debug(1,
+	jbd2_debug(1,
 		  "Cleaning journal tail from %u to %u (offset %lu), "
 		  "freeing %lu\n",
 		  journal->j_tail_sequence, tid, block, freed);
@@ -1496,7 +1496,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
 		return NULL;
 	}
 
-	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+	jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
 		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
 
@@ -1575,7 +1575,7 @@ static int journal_reset(journal_t *journal)
 	 * attempting a write to a potential-readonly device.
 	 */
 	if (sb->s_start == 0) {
-		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
+		jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
 			"(start %ld, seq %u, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
@@ -1678,7 +1678,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	}
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+	jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
 		  tail_block, tail_tid);
 
 	lock_buffer(journal->j_sb_buffer);
@@ -1719,7 +1719,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 		return;
 	}
 
-	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
+	jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
 		  journal->j_tail_sequence);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1862,7 +1862,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	errcode = journal->j_errno;
 	if (errcode == -ESHUTDOWN)
 		errcode = 0;
-	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+	jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
 	sb->s_errno    = cpu_to_be32(errcode);
 
 	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
@@ -2334,7 +2334,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
 	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
 		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
 
-	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+	jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
 		  compat, ro, incompat);
 
 	sb = journal->j_superblock;
@@ -2403,7 +2403,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
 {
 	journal_superblock_t *sb;
 
-	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+	jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
 		  compat, ro, incompat);
 
 	sb = journal->j_superblock;
@@ -2860,7 +2860,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 #endif
 	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 	if (!ret) {
-		jbd_debug(1, "out of memory for journal_head\n");
+		jbd2_debug(1, "out of memory for journal_head\n");
 		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
 		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
 				GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 8ca3527189f8..63594030afd3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -245,11 +245,11 @@ static int fc_do_one_pass(journal_t *journal,
 		return 0;
 
 	while (next_fc_block <= journal->j_fc_last) {
-		jbd_debug(3, "Fast commit replay: next block %ld\n",
+		jbd2_debug(3, "Fast commit replay: next block %ld\n",
 			  next_fc_block);
 		err = jread(&bh, journal, next_fc_block);
 		if (err) {
-			jbd_debug(3, "Fast commit replay: read error\n");
+			jbd2_debug(3, "Fast commit replay: read error\n");
 			break;
 		}
 
@@ -263,7 +263,7 @@ static int fc_do_one_pass(journal_t *journal,
 	}
 
 	if (err)
-		jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+		jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
 
 	return err;
 }
@@ -297,7 +297,7 @@ int jbd2_journal_recover(journal_t *journal)
 	 */
 
 	if (!sb->s_start) {
-		jbd_debug(1, "No recovery required, last transaction %d\n",
+		jbd2_debug(1, "No recovery required, last transaction %d\n",
 			  be32_to_cpu(sb->s_sequence));
 		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
 		return 0;
@@ -309,10 +309,10 @@ int jbd2_journal_recover(journal_t *journal)
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REPLAY);
 
-	jbd_debug(1, "JBD2: recovery, exit status %d, "
+	jbd2_debug(1, "JBD2: recovery, exit status %d, "
 		  "recovered transactions %u to %u\n",
 		  err, info.start_transaction, info.end_transaction);
-	jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+	jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
 
 	/* Restart the log at the next transaction ID, thus invalidating
@@ -362,7 +362,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD2_DEBUG
 		int dropped = info.end_transaction - 
 			be32_to_cpu(journal->j_superblock->s_sequence);
-		jbd_debug(1,
+		jbd2_debug(1,
 			  "JBD2: ignoring %d transaction%s from the journal.\n",
 			  dropped, (dropped == 1) ? "" : "s");
 #endif
@@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journal,
 	if (pass == PASS_SCAN)
 		info->start_transaction = first_commit_ID;
 
-	jbd_debug(1, "Starting recovery pass %d\n", pass);
+	jbd2_debug(1, "Starting recovery pass %d\n", pass);
 
 	/*
 	 * Now we walk through the log, transaction by transaction,
@@ -510,7 +510,7 @@ static int do_one_pass(journal_t *journal,
 			if (tid_geq(next_commit_ID, info->end_transaction))
 				break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
 			  next_commit_ID, next_log_block,
 			  jbd2_has_feature_fast_commit(journal) ?
 			  journal->j_fc_last : journal->j_last);
@@ -519,7 +519,7 @@ static int do_one_pass(journal_t *journal,
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+		jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -542,7 +542,7 @@ static int do_one_pass(journal_t *journal,
 
 		blocktype = be32_to_cpu(tmp->h_blocktype);
 		sequence = be32_to_cpu(tmp->h_sequence);
-		jbd_debug(3, "Found magic %d, sequence %d\n",
+		jbd2_debug(3, "Found magic %d, sequence %d\n",
 			  blocktype, sequence);
 
 		if (sequence != next_commit_ID) {
@@ -575,7 +575,7 @@ static int do_one_pass(journal_t *journal,
 					goto failed;
 				}
 				need_check_commit_time = true;
-				jbd_debug(1,
+				jbd2_debug(1,
 					"invalid descriptor block found in %lu\n",
 					next_log_block);
 			}
@@ -758,7 +758,7 @@ static int do_one_pass(journal_t *journal,
 				 * It likely does not belong to same journal,
 				 * just end this recovery with success.
 				 */
-				jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+				jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
 					  next_commit_ID);
 				brelse(bh);
 				goto done;
@@ -826,7 +826,7 @@ static int do_one_pass(journal_t *journal,
 			if (pass == PASS_SCAN &&
 			    !jbd2_descriptor_block_csum_verify(journal,
 							       bh->b_data)) {
-				jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+				jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
 					  next_log_block);
 				need_check_commit_time = true;
 			}
@@ -845,7 +845,7 @@ static int do_one_pass(journal_t *journal,
 			continue;
 
 		default:
-			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+			jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
 				  blocktype);
 			brelse(bh);
 			goto done;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4556e4689024 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
 	}
 	handle->h_revoke_credits--;
 
-	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+	jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	int did_revoke = 0;	/* akpm: debug */
 	struct buffer_head *bh = jh2bh(jh);
 
-	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+	jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
 
 	/* Is the existing Revoke bit valid?  If so, we trust it, and
 	 * only perform the full cancel if the revoke bit is set.  If
@@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	if (need_cancel) {
 		record = find_revoke_record(journal, bh->b_blocknr);
 		if (record) {
-			jbd_debug(4, "cancelled existing revoke on "
+			jbd2_debug(4, "cancelled existing revoke on "
 				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
 			spin_lock(&journal->j_revoke_lock);
 			list_del(&record->hash);
@@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
 	}
 	if (descriptor)
 		flush_descriptor(journal, descriptor, offset);
-	jbd_debug(1, "Wrote %d revoke records\n", count);
+	jbd2_debug(1, "Wrote %d revoke records\n", count);
 }
 
 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e9c308ae475f..1d1a926b25c5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -373,7 +373,7 @@ alloc_transaction:
 			return -ENOMEM;
 	}
 
-	jbd_debug(3, "New handle %p going live.\n", handle);
+	jbd2_debug(3, "New handle %p going live.\n", handle);
 
 	/*
 	 * We need to hold j_state_lock until t_updates has been incremented,
@@ -453,7 +453,7 @@ repeat:
 	handle->h_start_jiffies = jiffies;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+	jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
 		  handle, blocks,
 		  atomic_read(&transaction->t_outstanding_credits),
 		  jbd2_log_space_left(journal));
@@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 
 	/* Don't extend a locked-down transaction! */
 	if (transaction->t_state != T_RUNNING) {
-		jbd_debug(3, "denied handle %p %d blocks: "
+		jbd2_debug(3, "denied handle %p %d blocks: "
 			  "transaction not running\n", handle, nblocks);
 		goto error_out;
 	}
@@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 				   &transaction->t_outstanding_credits);
 
 	if (wanted > journal->j_max_transaction_buffers) {
-		jbd_debug(3, "denied handle %p %d blocks: "
+		jbd2_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
 		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto error_out;
@@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 	handle->h_revoke_credits_requested += revoke_records;
 	result = 0;
 
-	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+	jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
 error_out:
 	read_unlock(&journal->j_state_lock);
 	return result;
@@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
 	 * First unlink the handle from its current transaction, and start the
 	 * commit on that.
 	 */
-	jbd_debug(2, "restarting handle %p\n", handle);
+	jbd2_debug(2, "restarting handle %p\n", handle);
 	stop_this_handle(handle);
 	handle->h_transaction = NULL;
 
@@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 
 	journal = transaction->t_journal;
 
-	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+	jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
 
 	JBUFFER_TRACE(jh, "entry");
 repeat:
@@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 	int err;
 
-	jbd_debug(5, "journal_head %p\n", jh);
+	jbd2_debug(5, "journal_head %p\n", jh);
 	err = -EROFS;
 	if (is_handle_aborted(handle))
 		goto out;
@@ -1496,7 +1496,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	 * of the running transaction.
 	 */
 	jh = bh2jh(bh);
-	jbd_debug(5, "journal_head %p\n", jh);
+	jbd2_debug(5, "journal_head %p\n", jh);
 	JBUFFER_TRACE(jh, "entry");
 
 	/*
@@ -1818,7 +1818,7 @@ int jbd2_journal_stop(handle_t *handle)
 	pid_t pid;
 
 	if (--handle->h_ref > 0) {
-		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+		jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
 						 handle->h_ref);
 		if (is_handle_aborted(handle))
 			return -EIO;
@@ -1838,7 +1838,7 @@ int jbd2_journal_stop(handle_t *handle)
 	if (is_handle_aborted(handle))
 		err = -EIO;
 
-	jbd_debug(4, "Handle %p going down\n", handle);
+	jbd2_debug(4, "Handle %p going down\n", handle);
 	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
 				tid, handle->h_type, handle->h_line_no,
 				jiffies - handle->h_start_jiffies,
@@ -1916,7 +1916,7 @@ int jbd2_journal_stop(handle_t *handle)
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
 
-		jbd_debug(2, "transaction too old, requesting commit for "
+		jbd2_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
 		jbd2_log_start_commit(journal, tid);
@@ -2662,7 +2662,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
 		return -EROFS;
 	journal = transaction->t_journal;
 
-	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+	jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
 
 	spin_lock(&journal->j_list_lock);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index e79d6e0b14e8..d4d59e43769f 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -58,10 +58,10 @@ extern ushort jbd2_journal_enable_debug;
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...);
 
-#define jbd_debug(n, fmt, a...) \
+#define jbd2_debug(n, fmt, a...) \
 	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(n, fmt, a...)  no_printk(fmt, ##a)
+#define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
 #endif
 
 extern void *jbd2_alloc(size_t size, gfp_t flags);
-- 
cgit v1.2.3


From 68af74e92a86984ca6d5f7b19ee8f8a30a550873 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Jun 2022 13:23:49 +0200
Subject: jbd2: remove unused exports for jbd2 debugging

Jbd2 exports jbd2_journal_enable_debug and __jbd2_debug() depite the
first is used only in fs/jbd2/journal.c and the second only within jbd2
code. Remove the pointless exports make jbd2_journal_enable_debug
static.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Link: https://lore.kernel.org/r/20220608112355.4397-3-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c    | 4 +---
 include/linux/jbd2.h | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0a8ff211fac1..f38f57942700 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -49,8 +49,7 @@
 #include <asm/page.h>
 
 #ifdef CONFIG_JBD2_DEBUG
-ushort jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
+static ushort jbd2_journal_enable_debug __read_mostly;
 
 module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
 MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
@@ -115,7 +114,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
 	printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
 	va_end(args);
 }
-EXPORT_SYMBOL(__jbd2_debug);
 #endif
 
 /* Checksumming functions */
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d4d59e43769f..6c2aa61e0f73 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -54,7 +54,6 @@
  * CONFIG_JBD2_DEBUG is on.
  */
 #define JBD2_EXPENSIVE_CHECKING
-extern ushort jbd2_journal_enable_debug;
 void __jbd2_debug(int level, const char *file, const char *func,
 		  unsigned int line, const char *fmt, ...);
 
-- 
cgit v1.2.3


From d1324958567da957385d8d555a8b840b3bf8e6e3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Jun 2022 13:23:50 +0200
Subject: jbd2: unexport jbd2_log_start_commit()

jbd2_log_start_commit() is not used outside of jbd2 so unexport it. Also
make __jbd2_log_start_commit() static when we are at it.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Link: https://lore.kernel.org/r/20220608112355.4397-4-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c    | 3 +--
 include/linux/jbd2.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index f38f57942700..97e205a0689d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -80,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
 EXPORT_SYMBOL(jbd2_journal_clear_err);
 EXPORT_SYMBOL(jbd2_log_wait_commit);
-EXPORT_SYMBOL(jbd2_log_start_commit);
 EXPORT_SYMBOL(jbd2_journal_start_commit);
 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
 EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -479,7 +478,7 @@ repeat:
  * Called with j_state_lock locked for writing.
  * Returns true if a transaction commit was started.
  */
-int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
 	/* Return if the txn has already requested to be committed */
 	if (journal->j_commit_request == target)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6c2aa61e0f73..164ddf1211c0 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1646,7 +1646,6 @@ extern void	jbd2_clear_buffer_revoked_flags(journal_t *journal);
  */
 
 int jbd2_log_start_commit(journal_t *journal, tid_t tid);
-int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_transaction_committed(journal_t *journal, tid_t tid);
-- 
cgit v1.2.3


From 3dc96bba65f53daa217f0a8f43edad145286a8f5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2022 12:54:21 +0200
Subject: mbcache: add functions to delete entry if unused

Add function mb_cache_entry_delete_or_get() to delete mbcache entry if
it is unused and also add a function to wait for entry to become unused
- mb_cache_entry_wait_unused(). We do not share code between the two
deleting function as one of them will go away soon.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220712105436.32204-2-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/mbcache.c            | 66 +++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/mbcache.h | 10 +++++++-
 2 files changed, 73 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index cfc28129fb6f..2010bc80a3f2 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@
 /*
  * Mbcache is a simple key-value store. Keys need not be unique, however
  * key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
  *
  * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  * Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
 
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
 static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 					   struct mb_cache_entry *entry,
 					   u32 key)
@@ -217,7 +230,7 @@ out:
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete - try to remove a cache entry
  * @cache - cache we work with
  * @key - key
  * @value - value
@@ -254,6 +267,55 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
 }
 EXPORT_SYMBOL(mb_cache_entry_delete);
 
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
+ * @cache - cache we work with
+ * @key - key
+ * @value - value
+ *
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function grabs reference of the entry that we failed to delete because it
+ * still has users and return it.
+ */
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value)
+{
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+	struct mb_cache_entry *entry;
+
+	head = mb_cache_entry_head(cache, key);
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+		if (entry->e_key == key && entry->e_value == value) {
+			if (atomic_read(&entry->e_refcnt) > 2) {
+				atomic_inc(&entry->e_refcnt);
+				hlist_bl_unlock(head);
+				return entry;
+			}
+			/* We keep hash list reference to keep entry alive */
+			hlist_bl_del_init(&entry->e_hash_list);
+			hlist_bl_unlock(head);
+			spin_lock(&cache->c_list_lock);
+			if (!list_empty(&entry->e_list)) {
+				list_del_init(&entry->e_list);
+				if (!WARN_ONCE(cache->c_entry_count == 0,
+		"mbcache: attempt to decrement c_entry_count past zero"))
+					cache->c_entry_count--;
+				atomic_dec(&entry->e_refcnt);
+			}
+			spin_unlock(&cache->c_list_lock);
+			mb_cache_entry_put(cache, entry);
+			return NULL;
+		}
+	}
+	hlist_bl_unlock(head);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
+
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
  * @entry - entry that got used
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 20f1e3ff6013..8eca7f25c432 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache);
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
 {
-	if (!atomic_dec_and_test(&entry->e_refcnt))
+	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
+
+	if (cnt > 0) {
+		if (cnt <= 3)
+			wake_up_var(&entry->e_refcnt);
 		return 0;
+	}
 	__mb_cache_entry_free(entry);
 	return 1;
 }
 
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+						    u32 key, u64 value);
 void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);
-- 
cgit v1.2.3


From 75896339e43176af078509c1fce94ee6df9ca1a7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2022 12:54:28 +0200
Subject: mbcache: Remove mb_cache_entry_delete()

Nobody uses mb_cache_entry_delete() anymore. Remove it.

Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220712105436.32204-9-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/mbcache.c            | 37 -------------------------------------
 include/linux/mbcache.h |  1 -
 2 files changed, 38 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2010bc80a3f2..d1ebb5df2856 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -230,43 +230,6 @@ out:
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - try to remove a cache entry
- * @cache - cache we work with
- * @key - key
- * @value - value
- *
- * Remove entry from cache @cache with key @key and value @value.
- */
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
-{
-	struct hlist_bl_node *node;
-	struct hlist_bl_head *head;
-	struct mb_cache_entry *entry;
-
-	head = mb_cache_entry_head(cache, key);
-	hlist_bl_lock(head);
-	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			/* We keep hash list reference to keep entry alive */
-			hlist_bl_del_init(&entry->e_hash_list);
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			if (!list_empty(&entry->e_list)) {
-				list_del_init(&entry->e_list);
-				if (!WARN_ONCE(cache->c_entry_count == 0,
-		"mbcache: attempt to decrement c_entry_count past zero"))
-					cache->c_entry_count--;
-				atomic_dec(&entry->e_refcnt);
-			}
-			spin_unlock(&cache->c_list_lock);
-			mb_cache_entry_put(cache, entry);
-			return;
-		}
-	}
-	hlist_bl_unlock(head);
-}
-EXPORT_SYMBOL(mb_cache_entry_delete);
-
 /* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
  * @cache - cache we work with
  * @key - key
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 8eca7f25c432..452b579856d4 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -47,7 +47,6 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
 
 struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
 						    u32 key, u64 value);
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
-- 
cgit v1.2.3


From 307af6c879377c1c63e71cbdd978201f9c7ee8df Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2022 12:54:29 +0200
Subject: mbcache: automatically delete entries from cache on freeing

Use the fact that entries with elevated refcount are not removed from
the hash and just move removal of the entry from the hash to the entry
freeing time. When doing this we also change the generic code to hold
one reference to the cache entry, not two of them, which makes code
somewhat more obvious.

Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20220712105436.32204-10-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/mbcache.c            | 108 +++++++++++++++++-------------------------------
 include/linux/mbcache.h |  24 +++++++----
 2 files changed, 55 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index d1ebb5df2856..96f1d49d30a5 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&entry->e_list);
-	/* One ref for hash, one ref returned */
+	/* Initial hash reference */
 	atomic_set(&entry->e_refcnt, 1);
 	entry->e_key = key;
 	entry->e_value = value;
@@ -106,21 +106,28 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 		}
 	}
 	hlist_bl_add_head(&entry->e_hash_list, head);
-	hlist_bl_unlock(head);
-
+	/*
+	 * Add entry to LRU list before it can be found by
+	 * mb_cache_entry_delete() to avoid races
+	 */
 	spin_lock(&cache->c_list_lock);
 	list_add_tail(&entry->e_list, &cache->c_list);
-	/* Grab ref for LRU list */
-	atomic_inc(&entry->e_refcnt);
 	cache->c_entry_count++;
 	spin_unlock(&cache->c_list_lock);
+	hlist_bl_unlock(head);
 
 	return 0;
 }
 EXPORT_SYMBOL(mb_cache_entry_create);
 
-void __mb_cache_entry_free(struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
 {
+	struct hlist_bl_head *head;
+
+	head = mb_cache_entry_head(cache, entry->e_key);
+	hlist_bl_lock(head);
+	hlist_bl_del(&entry->e_hash_list);
+	hlist_bl_unlock(head);
 	kmem_cache_free(mb_entry_cache, entry);
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
@@ -134,7 +141,7 @@ EXPORT_SYMBOL(__mb_cache_entry_free);
  */
 void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
 {
-	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
 }
 EXPORT_SYMBOL(mb_cache_entry_wait_unused);
 
@@ -155,10 +162,9 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 	while (node) {
 		entry = hlist_bl_entry(node, struct mb_cache_entry,
 				       e_hash_list);
-		if (entry->e_key == key && entry->e_reusable) {
-			atomic_inc(&entry->e_refcnt);
+		if (entry->e_key == key && entry->e_reusable &&
+		    atomic_inc_not_zero(&entry->e_refcnt))
 			goto out;
-		}
 		node = node->next;
 	}
 	entry = NULL;
@@ -218,10 +224,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			atomic_inc(&entry->e_refcnt);
+		if (entry->e_key == key && entry->e_value == value &&
+		    atomic_inc_not_zero(&entry->e_refcnt))
 			goto out;
-		}
 	}
 	entry = NULL;
 out:
@@ -244,37 +249,25 @@ EXPORT_SYMBOL(mb_cache_entry_get);
 struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
 						    u32 key, u64 value)
 {
-	struct hlist_bl_node *node;
-	struct hlist_bl_head *head;
 	struct mb_cache_entry *entry;
 
-	head = mb_cache_entry_head(cache, key);
-	hlist_bl_lock(head);
-	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
-		if (entry->e_key == key && entry->e_value == value) {
-			if (atomic_read(&entry->e_refcnt) > 2) {
-				atomic_inc(&entry->e_refcnt);
-				hlist_bl_unlock(head);
-				return entry;
-			}
-			/* We keep hash list reference to keep entry alive */
-			hlist_bl_del_init(&entry->e_hash_list);
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			if (!list_empty(&entry->e_list)) {
-				list_del_init(&entry->e_list);
-				if (!WARN_ONCE(cache->c_entry_count == 0,
-		"mbcache: attempt to decrement c_entry_count past zero"))
-					cache->c_entry_count--;
-				atomic_dec(&entry->e_refcnt);
-			}
-			spin_unlock(&cache->c_list_lock);
-			mb_cache_entry_put(cache, entry);
-			return NULL;
-		}
-	}
-	hlist_bl_unlock(head);
+	entry = mb_cache_entry_get(cache, key, value);
+	if (!entry)
+		return NULL;
+
+	/*
+	 * Drop the ref we got from mb_cache_entry_get() and the initial hash
+	 * ref if we are the last user
+	 */
+	if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
+		return entry;
 
+	spin_lock(&cache->c_list_lock);
+	if (!list_empty(&entry->e_list))
+		list_del_init(&entry->e_list);
+	cache->c_entry_count--;
+	spin_unlock(&cache->c_list_lock);
+	__mb_cache_entry_free(cache, entry);
 	return NULL;
 }
 EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
@@ -306,42 +299,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 				     unsigned long nr_to_scan)
 {
 	struct mb_cache_entry *entry;
-	struct hlist_bl_head *head;
 	unsigned long shrunk = 0;
 
 	spin_lock(&cache->c_list_lock);
 	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
 		entry = list_first_entry(&cache->c_list,
 					 struct mb_cache_entry, e_list);
-		if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) {
+		/* Drop initial hash reference if there is no user */
+		if (entry->e_referenced ||
+		    atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
 			entry->e_referenced = 0;
 			list_move_tail(&entry->e_list, &cache->c_list);
 			continue;
 		}
 		list_del_init(&entry->e_list);
 		cache->c_entry_count--;
-		/*
-		 * We keep LRU list reference so that entry doesn't go away
-		 * from under us.
-		 */
 		spin_unlock(&cache->c_list_lock);
-		head = mb_cache_entry_head(cache, entry->e_key);
-		hlist_bl_lock(head);
-		/* Now a reliable check if the entry didn't get used... */
-		if (atomic_read(&entry->e_refcnt) > 2) {
-			hlist_bl_unlock(head);
-			spin_lock(&cache->c_list_lock);
-			list_add_tail(&entry->e_list, &cache->c_list);
-			cache->c_entry_count++;
-			continue;
-		}
-		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
-			hlist_bl_del_init(&entry->e_hash_list);
-			atomic_dec(&entry->e_refcnt);
-		}
-		hlist_bl_unlock(head);
-		if (mb_cache_entry_put(cache, entry))
-			shrunk++;
+		__mb_cache_entry_free(cache, entry);
+		shrunk++;
 		cond_resched();
 		spin_lock(&cache->c_list_lock);
 	}
@@ -433,11 +408,6 @@ void mb_cache_destroy(struct mb_cache *cache)
 	 * point.
 	 */
 	list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
-		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
-			hlist_bl_del_init(&entry->e_hash_list);
-			atomic_dec(&entry->e_refcnt);
-		} else
-			WARN_ON(1);
 		list_del(&entry->e_list);
 		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
 		mb_cache_entry_put(cache, entry);
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 452b579856d4..2da63fd7b98f 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -13,8 +13,16 @@ struct mb_cache;
 struct mb_cache_entry {
 	/* List of entries in cache - protected by cache->c_list_lock */
 	struct list_head	e_list;
-	/* Hash table list - protected by hash chain bitlock */
+	/*
+	 * Hash table list - protected by hash chain bitlock. The entry is
+	 * guaranteed to be hashed while e_refcnt > 0.
+	 */
 	struct hlist_bl_node	e_hash_list;
+	/*
+	 * Entry refcount. Once it reaches zero, entry is unhashed and freed.
+	 * While refcount > 0, the entry is guaranteed to stay in the hash and
+	 * e.g. mb_cache_entry_try_delete() will fail.
+	 */
 	atomic_t		e_refcnt;
 	/* Key in hash - stable during lifetime of the entry */
 	u32			e_key;
@@ -29,20 +37,20 @@ void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
-void __mb_cache_entry_free(struct mb_cache_entry *entry);
+void __mb_cache_entry_free(struct mb_cache *cache,
+			   struct mb_cache_entry *entry);
 void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
-static inline int mb_cache_entry_put(struct mb_cache *cache,
-				     struct mb_cache_entry *entry)
+static inline void mb_cache_entry_put(struct mb_cache *cache,
+				      struct mb_cache_entry *entry)
 {
 	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
 
 	if (cnt > 0) {
-		if (cnt <= 3)
+		if (cnt <= 2)
 			wake_up_var(&entry->e_refcnt);
-		return 0;
+		return;
 	}
-	__mb_cache_entry_free(entry);
-	return 1;
+	__mb_cache_entry_free(cache, entry);
 }
 
 struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
-- 
cgit v1.2.3


From b6e8d40d43ae4dec00c8fea2593eeea3114b8f44 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 2 Aug 2022 21:54:51 -0400
Subject: sched, cpuset: Fix dl_cpu_busy() panic due to empty cs->cpus_allowed

With cgroup v2, the cpuset's cpus_allowed mask can be empty indicating
that the cpuset will just use the effective CPUs of its parent. So
cpuset_can_attach() can call task_can_attach() with an empty mask.
This can lead to cpumask_any_and() returns nr_cpu_ids causing the call
to dl_bw_of() to crash due to percpu value access of an out of bound
CPU value. For example:

	[80468.182258] BUG: unable to handle page fault for address: ffffffff8b6648b0
	  :
	[80468.191019] RIP: 0010:dl_cpu_busy+0x30/0x2b0
	  :
	[80468.207946] Call Trace:
	[80468.208947]  cpuset_can_attach+0xa0/0x140
	[80468.209953]  cgroup_migrate_execute+0x8c/0x490
	[80468.210931]  cgroup_update_dfl_csses+0x254/0x270
	[80468.211898]  cgroup_subtree_control_write+0x322/0x400
	[80468.212854]  kernfs_fop_write_iter+0x11c/0x1b0
	[80468.213777]  new_sync_write+0x11f/0x1b0
	[80468.214689]  vfs_write+0x1eb/0x280
	[80468.215592]  ksys_write+0x5f/0xe0
	[80468.216463]  do_syscall_64+0x5c/0x80
	[80468.224287]  entry_SYSCALL_64_after_hwframe+0x44/0xae

Fix that by using effective_cpus instead. For cgroup v1, effective_cpus
is the same as cpus_allowed. For v2, effective_cpus is the real cpumask
to be used by tasks within the cpuset anyway.

Also update task_can_attach()'s 2nd argument name to cs_effective_cpus to
reflect the change. In addition, a check is added to task_can_attach()
to guard against the possibility that cpumask_any_and() may return a
value >= nr_cpu_ids.

Fixes: 7f51412a415d ("sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20220803015451.2219567-1-longman@redhat.com
---
 include/linux/sched.h  | 2 +-
 kernel/cgroup/cpuset.c | 2 +-
 kernel/sched/core.c    | 8 +++++---
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 88b8817b827d..6a060160f0db 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1813,7 +1813,7 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags)
 }
 
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
+extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus);
 #ifdef CONFIG_SMP
 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
 extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 71a418858a5e..58aadfda9b8b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2239,7 +2239,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 		goto out_unlock;
 
 	cgroup_taskset_for_each(task, css, tset) {
-		ret = task_can_attach(task, cs->cpus_allowed);
+		ret = task_can_attach(task, cs->effective_cpus);
 		if (ret)
 			goto out_unlock;
 		ret = security_task_setscheduler(task);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5555e49c4e12..addc3c2d2122 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8980,7 +8980,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 }
 
 int task_can_attach(struct task_struct *p,
-		    const struct cpumask *cs_cpus_allowed)
+		    const struct cpumask *cs_effective_cpus)
 {
 	int ret = 0;
 
@@ -8999,9 +8999,11 @@ int task_can_attach(struct task_struct *p,
 	}
 
 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-					      cs_cpus_allowed)) {
-		int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+					      cs_effective_cpus)) {
+		int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
 
+		if (unlikely(cpu >= nr_cpu_ids))
+			return -EINVAL;
 		ret = dl_cpu_busy(cpu, p);
 	}
 
-- 
cgit v1.2.3


From a8af0d682ae0c9cf62dd0ad6afdb1480951d6a10 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 30 Jun 2022 16:21:50 -0400
Subject: libceph: clean up ceph_osdc_start_request prototype

This function always returns 0, and ignores the nofail boolean. Drop the
nofail argument, make the function void return and fix up the callers.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c             |  6 +++---
 fs/ceph/addr.c                  | 33 +++++++++++++--------------------
 fs/ceph/file.c                  | 32 +++++++++++++-------------------
 include/linux/ceph/osd_client.h |  5 ++---
 net/ceph/osd_client.c           | 15 ++++++---------
 5 files changed, 37 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ef9bc62e9afd..b4580b73479f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1297,7 +1297,7 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
-	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
+	ceph_osdc_start_request(osd_req->r_osdc, osd_req);
 }
 
 /*
@@ -2081,7 +2081,7 @@ static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
 	if (ret)
 		return ret;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	return 0;
 }
 
@@ -4768,7 +4768,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
 	if (ret)
 		goto out_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0)
 		ceph_copy_from_page_vector(pages, buf, 0, ret);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index de12715c237b..ec76e77f8d4b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -337,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	/* should always give us a page-aligned read */
 	WARN_ON_ONCE(page_off);
 	len = err;
+	err = 0;
 
 	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
 	req->r_callback = finish_netfs_read;
@@ -344,9 +345,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
 	req->r_inode = inode;
 	ihold(inode);
 
-	err = ceph_osdc_start_request(req->r_osdc, req, false);
-	if (err)
-		iput(inode);
+	ceph_osdc_start_request(req->r_osdc, req);
 out:
 	ceph_osdc_put_request(req);
 	if (err)
@@ -620,9 +619,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
 
 	req->r_mtime = inode->i_mtime;
-	err = ceph_osdc_start_request(osdc, req, true);
-	if (!err)
-		err = ceph_osdc_wait_request(osdc, req);
+	ceph_osdc_start_request(osdc, req);
+	err = ceph_osdc_wait_request(osdc, req);
 
 	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 				  req->r_end_latency, len, err);
@@ -1150,8 +1148,7 @@ new_request:
 		}
 
 		req->r_mtime = inode->i_mtime;
-		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
-		BUG_ON(rc);
+		ceph_osdc_start_request(&fsc->client->osdc, req);
 		req = NULL;
 
 		wbc->nr_to_write -= i;
@@ -1692,9 +1689,8 @@ int ceph_uninline_data(struct file *file)
 	}
 
 	req->r_mtime = inode->i_mtime;
-	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!err)
-		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	ceph_osdc_start_request(&fsc->client->osdc, req);
+	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 	ceph_osdc_put_request(req);
 	if (err < 0)
 		goto out_unlock;
@@ -1735,9 +1731,8 @@ int ceph_uninline_data(struct file *file)
 	}
 
 	req->r_mtime = inode->i_mtime;
-	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!err)
-		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	ceph_osdc_start_request(&fsc->client->osdc, req);
+	err = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
 	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 				  req->r_end_latency, len, err);
@@ -1908,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+	ceph_osdc_start_request(&fsc->client->osdc, rd_req);
 
 	wr_req->r_mtime = ci->netfs.inode.i_mtime;
-	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+	ceph_osdc_start_request(&fsc->client->osdc, wr_req);
 
-	if (!err)
-		err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
-	if (!err2)
-		err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+	err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+	err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
 
 	if (err >= 0 || err == -ENOENT)
 		have |= POOL_READ;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b4e978420802..c3caa9bf9755 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -985,9 +985,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 
 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
 						 false, false);
-		ret = ceph_osdc_start_request(osdc, req, false);
-		if (!ret)
-			ret = ceph_osdc_wait_request(osdc, req);
+		ceph_osdc_start_request(osdc, req);
+		ret = ceph_osdc_wait_request(osdc, req);
 
 		ceph_update_read_metrics(&fsc->mdsc->metric,
 					 req->r_start_latency,
@@ -1250,7 +1249,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_inode = inode;
 	req->r_priv = aio_req;
 
-	ret = ceph_osdc_start_request(req->r_osdc, req, false);
+	ceph_osdc_start_request(req->r_osdc, req);
 out:
 	if (ret < 0) {
 		req->r_result = ret;
@@ -1387,9 +1386,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			continue;
 		}
 
-		ret = ceph_osdc_start_request(req->r_osdc, req, false);
-		if (!ret)
-			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+		ceph_osdc_start_request(req->r_osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
 		if (write)
 			ceph_update_write_metrics(metric, req->r_start_latency,
@@ -1452,8 +1450,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 					       r_private_item);
 			list_del_init(&req->r_private_item);
 			if (ret >= 0)
-				ret = ceph_osdc_start_request(req->r_osdc,
-							      req, false);
+				ceph_osdc_start_request(req->r_osdc, req);
 			if (ret < 0) {
 				req->r_result = ret;
 				ceph_aio_complete_req(req);
@@ -1566,9 +1563,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 						false, true);
 
 		req->r_mtime = mtime;
-		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-		if (!ret)
-			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+		ceph_osdc_start_request(&fsc->client->osdc, req);
+		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
 		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
 					  req->r_end_latency, len, ret);
@@ -2032,12 +2028,10 @@ static int ceph_zero_partial_object(struct inode *inode,
 	}
 
 	req->r_mtime = inode->i_mtime;
-	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!ret) {
-		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
-		if (ret == -ENOENT)
-			ret = 0;
-	}
+	ceph_osdc_start_request(&fsc->client->osdc, req);
+	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	if (ret == -ENOENT)
+		ret = 0;
 	ceph_osdc_put_request(req);
 
 out:
@@ -2339,7 +2333,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
 		if (IS_ERR(req))
 			ret = PTR_ERR(req);
 		else {
-			ceph_osdc_start_request(osdc, req, false);
+			ceph_osdc_start_request(osdc, req);
 			ret = ceph_osdc_wait_request(osdc, req);
 			ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
 						     req->r_start_latency,
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cba8a6ffc329..fb6be72104df 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -507,9 +507,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-				   struct ceph_osd_request *req,
-				   bool nofail);
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req);
 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 				  struct ceph_osd_request *req);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 9d82bb42e958..87b883c7bfd6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4578,15 +4578,12 @@ bad:
 /*
  * Register request, send initial attempt.
  */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-			    struct ceph_osd_request *req,
-			    bool nofail)
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
 {
 	down_read(&osdc->lock);
 	submit_request(req, false);
 	up_read(&osdc->lock);
-
-	return 0;
 }
 EXPORT_SYMBOL(ceph_osdc_start_request);
 
@@ -4756,7 +4753,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	linger_cancel(lreq);
 	linger_put(lreq);
 	ret = wait_request_timeout(req, opts->mount_timeout);
@@ -4827,7 +4824,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 
 out_put_req:
@@ -5043,7 +5040,7 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		void *p = page_address(pages[0]);
@@ -5120,7 +5117,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		ret = req->r_ops[0].rval;
-- 
cgit v1.2.3


From bed4593645366ad7362a3aa7bc0d100d8d8236a8 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Mon, 11 Jul 2022 09:17:38 +0800
Subject: tpm: eventlog: Fix section mismatch for DEBUG_SECTION_MISMATCH

If DEBUG_SECTION_MISMATCH enabled, __calc_tpm2_event_size() will not be
inlined, this cause section mismatch like this:

WARNING: modpost: vmlinux.o(.text.unlikely+0xe30c): Section mismatch in reference from the variable L0 to the function .init.text:early_ioremap()
The function L0() references
the function __init early_memremap().
This is often because L0 lacks a __init
annotation or the annotation of early_ioremap is wrong.

Fix it by using __always_inline instead of inline for the called-once
function __calc_tpm2_event_size().

Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations")
Cc: stable@vger.kernel.org # v5.3
Reported-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm_eventlog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 739ba9a03ec1..20c0ff54b7a0 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -157,7 +157,7 @@ struct tcg_algorithm_info {
  * Return: size of the event on success, 0 on failure
  */
 
-static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
+static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
 					 struct tcg_pcr_event *event_header,
 					 bool do_mapping)
 {
-- 
cgit v1.2.3


From 6930bcbfb6ceda63e298c6af6d733ecdf6bd4cde Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 1 Aug 2022 15:57:26 -0400
Subject: lockd: detect and reject lock arguments that overflow

lockd doesn't currently vet the start and length in nlm4 requests like
it should, and can end up generating lock requests with arguments that
overflow when passed to the filesystem.

The NLM4 protocol uses unsigned 64-bit arguments for both start and
length, whereas struct file_lock tracks the start and end as loff_t
values. By the time we get around to calling nlm4svc_retrieve_args,
we've lost the information that would allow us to determine if there was
an overflow.

Start tracking the actual start and len for NLM4 requests in the
nlm_lock. In nlm4svc_retrieve_args, vet these values to ensure they
won't cause an overflow, and return NLM4_FBIG if they do.

Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=392
Reported-by: Jan Kasiak <j.kasiak@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: <stable@vger.kernel.org> # 5.14+
---
 fs/lockd/svc4proc.c       |  8 ++++++++
 fs/lockd/xdr4.c           | 19 ++-----------------
 include/linux/lockd/xdr.h |  2 ++
 3 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4f247ab8be61..bf274f23969b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -32,6 +32,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 	if (!nlmsvc_ops)
 		return nlm_lck_denied_nolocks;
 
+	if (lock->lock_start > OFFSET_MAX ||
+	    (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start))))
+		return nlm4_fbig;
+
 	/* Obtain host handle */
 	if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
 	 || (argp->monitor && nsm_monitor(host) < 0))
@@ -50,6 +54,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 		/* Set up the missing parts of the file_lock structure */
 		lock->fl.fl_file  = file->f_file[mode];
 		lock->fl.fl_pid = current->tgid;
+		lock->fl.fl_start = (loff_t)lock->lock_start;
+		lock->fl.fl_end = lock->lock_len ?
+				   (loff_t)(lock->lock_start + lock->lock_len - 1) :
+				   OFFSET_MAX;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
 		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
 		if (!lock->fl.fl_owner) {
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 856267c0864b..712fdfeb8ef0 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -20,13 +20,6 @@
 
 #include "svcxdr.h"
 
-static inline loff_t
-s64_to_loff_t(__s64 offset)
-{
-	return (loff_t)offset;
-}
-
-
 static inline s64
 loff_t_to_s64(loff_t offset)
 {
@@ -70,8 +63,6 @@ static bool
 svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
 {
 	struct file_lock *fl = &lock->fl;
-	u64 len, start;
-	s64 end;
 
 	if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
 		return false;
@@ -81,20 +72,14 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
 		return false;
 	if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
 		return false;
-	if (xdr_stream_decode_u64(xdr, &start) < 0)
+	if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0)
 		return false;
-	if (xdr_stream_decode_u64(xdr, &len) < 0)
+	if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0)
 		return false;
 
 	locks_init_lock(fl);
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;
-	end = start + len - 1;
-	fl->fl_start = s64_to_loff_t(start);
-	if (len == 0 || end < 0)
-		fl->fl_end = OFFSET_MAX;
-	else
-		fl->fl_end = s64_to_loff_t(end);
 
 	return true;
 }
diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 398f70093cd3..67e4a2c5500b 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -41,6 +41,8 @@ struct nlm_lock {
 	struct nfs_fh		fh;
 	struct xdr_netobj	oh;
 	u32			svid;
+	u64			lock_start;
+	u64			lock_len;
 	struct file_lock	fl;
 };
 
-- 
cgit v1.2.3


From f482aa98652795846cc55da98ebe331eb74f3d0b Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Wed, 3 Aug 2022 15:23:43 -0700
Subject: audit, io_uring, io-wq: Fix memory leak in io_sq_thread() and
 io_wqe_worker()

Currently @audit_context is allocated twice for io_uring workers:

  1. copy_process() calls audit_alloc();
  2. io_sq_thread() or io_wqe_worker() calls audit_alloc_kernel() (which
     is effectively audit_alloc()) and overwrites @audit_context,
     causing:

  BUG: memory leak
  unreferenced object 0xffff888144547400 (size 1024):
<...>
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00  ................
      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    backtrace:
      [<ffffffff8135cfc3>] audit_alloc+0x133/0x210
      [<ffffffff81239e63>] copy_process+0xcd3/0x2340
      [<ffffffff8123b5f3>] create_io_thread+0x63/0x90
      [<ffffffff81686604>] create_io_worker+0xb4/0x230
      [<ffffffff81686f68>] io_wqe_enqueue+0x248/0x3b0
      [<ffffffff8167663a>] io_queue_iowq+0xba/0x200
      [<ffffffff816768b3>] io_queue_async+0x113/0x180
      [<ffffffff816840df>] io_req_task_submit+0x18f/0x1a0
      [<ffffffff816841cd>] io_apoll_task_func+0xdd/0x120
      [<ffffffff8167d49f>] tctx_task_work+0x11f/0x570
      [<ffffffff81272c4e>] task_work_run+0x7e/0xc0
      [<ffffffff8125a688>] get_signal+0xc18/0xf10
      [<ffffffff8111645b>] arch_do_signal_or_restart+0x2b/0x730
      [<ffffffff812ea44e>] exit_to_user_mode_prepare+0x5e/0x180
      [<ffffffff844ae1b2>] syscall_exit_to_user_mode+0x12/0x20
      [<ffffffff844a7e80>] do_syscall_64+0x40/0x80

Then,

  3. io_sq_thread() or io_wqe_worker() frees @audit_context using
     audit_free();
  4. do_exit() eventually calls audit_free() again, which is okay
     because audit_free() does a NULL check.

As suggested by Paul Moore, fix it by deleting audit_alloc_kernel() and
redundant audit_free() calls.

Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Suggested-by: Paul Moore <paul@paul-moore.com>
Cc: stable@vger.kernel.org
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/r/20220803222343.31673-1-yepeilin.cs@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/audit.h |  5 -----
 io_uring/io-wq.c      |  3 ---
 io_uring/sqpoll.c     |  4 ----
 kernel/auditsc.c      | 25 -------------------------
 4 files changed, 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 00f7a80f1a3e..3608992848d3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -285,7 +285,6 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 /* These are defined in auditsc.c */
 				/* Public API */
 extern int  audit_alloc(struct task_struct *task);
-extern int  audit_alloc_kernel(struct task_struct *task);
 extern void __audit_free(struct task_struct *task);
 extern void __audit_uring_entry(u8 op);
 extern void __audit_uring_exit(int success, long code);
@@ -578,10 +577,6 @@ static inline int audit_alloc(struct task_struct *task)
 {
 	return 0;
 }
-static inline int audit_alloc_kernel(struct task_struct *task)
-{
-	return 0;
-}
 static inline void audit_free(struct task_struct *task)
 { }
 static inline void audit_uring_entry(u8 op)
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 77df5b43bf52..c6536d4b2da0 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -624,8 +624,6 @@ static int io_wqe_worker(void *data)
 	snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
 	set_task_comm(current, buf);
 
-	audit_alloc_kernel(current);
-
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		long ret;
 
@@ -660,7 +658,6 @@ static int io_wqe_worker(void *data)
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		io_worker_handle_work(worker);
 
-	audit_free(current);
 	io_worker_exit(worker);
 	return 0;
 }
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 76d4d70c733a..559652380672 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -235,8 +235,6 @@ static int io_sq_thread(void *data)
 		set_cpus_allowed_ptr(current, cpu_online_mask);
 	current->flags |= PF_NO_SETAFFINITY;
 
-	audit_alloc_kernel(current);
-
 	mutex_lock(&sqd->lock);
 	while (1) {
 		bool cap_entries, sqt_spin = false;
@@ -310,8 +308,6 @@ static int io_sq_thread(void *data)
 	io_run_task_work();
 	mutex_unlock(&sqd->lock);
 
-	audit_free(current);
-
 	complete(&sqd->exited);
 	do_exit(0);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3a8c9d744800..dd8d9ab747c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1073,31 +1073,6 @@ int audit_alloc(struct task_struct *tsk)
 	return 0;
 }
 
-/**
- * audit_alloc_kernel - allocate an audit_context for a kernel task
- * @tsk: the kernel task
- *
- * Similar to the audit_alloc() function, but intended for kernel private
- * threads.  Returns zero on success, negative values on failure.
- */
-int audit_alloc_kernel(struct task_struct *tsk)
-{
-	/*
-	 * At the moment we are just going to call into audit_alloc() to
-	 * simplify the code, but there two things to keep in mind with this
-	 * approach:
-	 *
-	 * 1. Filtering internal kernel tasks is a bit laughable in almost all
-	 * cases, but there is at least one case where there is a benefit:
-	 * the '-a task,never' case allows the admin to effectively disable
-	 * task auditing at runtime.
-	 *
-	 * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
-	 * on these internal kernel tasks, but they probably don't hurt either.
-	 */
-	return audit_alloc(tsk);
-}
-
 static inline void audit_free_context(struct audit_context *context)
 {
 	/* resetting is extra work, but it is likely just noise */
-- 
cgit v1.2.3


From e2dcac2f58f5a95ab092d1da237ffdc0da1832cf Mon Sep 17 00:00:00 2001
From: Jinghao Jia <jinghao@linux.ibm.com>
Date: Fri, 29 Jul 2022 20:17:13 +0000
Subject: BPF: Fix potential bad pointer dereference in bpf_sys_bpf()

The bpf_sys_bpf() helper function allows an eBPF program to load another
eBPF program from within the kernel. In this case the argument union
bpf_attr pointer (as well as the insns and license pointers inside) is a
kernel address instead of a userspace address (which is the case of a
usual bpf() syscall). To make the memory copying process in the syscall
work in both cases, bpfptr_t was introduced to wrap around the pointer
and distinguish its origin. Specifically, when copying memory contents
from a bpfptr_t, a copy_from_user() is performed in case of a userspace
address and a memcpy() is performed for a kernel address.

This can lead to problems because the in-kernel pointer is never checked
for validity. The problem happens when an eBPF syscall program tries to
call bpf_sys_bpf() to load a program but provides a bad insns pointer --
say 0xdeadbeef -- in the bpf_attr union. The helper calls __sys_bpf()
which would then call bpf_prog_load() to load the program.
bpf_prog_load() is responsible for copying the eBPF instructions to the
newly allocated memory for the program; it creates a kernel bpfptr_t for
insns and invokes copy_from_bpfptr(). Internally, all bpfptr_t
operations are backed by the corresponding sockptr_t operations, which
performs direct memcpy() on kernel pointers for copy_from/strncpy_from
operations. Therefore, the code is always happy to dereference the bad
pointer to trigger a un-handle-able page fault and in turn an oops.
However, this is not supposed to happen because at that point the eBPF
program is already verified and should not cause a memory error.

Sample KASAN trace:

[   25.685056][  T228] ==================================================================
[   25.685680][  T228] BUG: KASAN: user-memory-access in copy_from_bpfptr+0x21/0x30
[   25.686210][  T228] Read of size 80 at addr 00000000deadbeef by task poc/228
[   25.686732][  T228]
[   25.686893][  T228] CPU: 3 PID: 228 Comm: poc Not tainted 5.19.0-rc7 #7
[   25.687375][  T228] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS d55cb5a 04/01/2014
[   25.687991][  T228] Call Trace:
[   25.688223][  T228]  <TASK>
[   25.688429][  T228]  dump_stack_lvl+0x73/0x9e
[   25.688747][  T228]  print_report+0xea/0x200
[   25.689061][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.689401][  T228]  ? _printk+0x54/0x6e
[   25.689693][  T228]  ? _raw_spin_lock_irqsave+0x70/0xd0
[   25.690071][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.690412][  T228]  kasan_report+0xb5/0xe0
[   25.690716][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.691059][  T228]  kasan_check_range+0x2bd/0x2e0
[   25.691405][  T228]  ? copy_from_bpfptr+0x21/0x30
[   25.691734][  T228]  memcpy+0x25/0x60
[   25.692000][  T228]  copy_from_bpfptr+0x21/0x30
[   25.692328][  T228]  bpf_prog_load+0x604/0x9e0
[   25.692653][  T228]  ? cap_capable+0xb4/0xe0
[   25.692956][  T228]  ? security_capable+0x4f/0x70
[   25.693324][  T228]  __sys_bpf+0x3af/0x580
[   25.693635][  T228]  bpf_sys_bpf+0x45/0x240
[   25.693937][  T228]  bpf_prog_f0ec79a5a3caca46_bpf_func1+0xa2/0xbd
[   25.694394][  T228]  bpf_prog_run_pin_on_cpu+0x2f/0xb0
[   25.694756][  T228]  bpf_prog_test_run_syscall+0x146/0x1c0
[   25.695144][  T228]  bpf_prog_test_run+0x172/0x190
[   25.695487][  T228]  __sys_bpf+0x2c5/0x580
[   25.695776][  T228]  __x64_sys_bpf+0x3a/0x50
[   25.696084][  T228]  do_syscall_64+0x60/0x90
[   25.696393][  T228]  ? fpregs_assert_state_consistent+0x50/0x60
[   25.696815][  T228]  ? exit_to_user_mode_prepare+0x36/0xa0
[   25.697202][  T228]  ? syscall_exit_to_user_mode+0x20/0x40
[   25.697586][  T228]  ? do_syscall_64+0x6e/0x90
[   25.697899][  T228]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   25.698312][  T228] RIP: 0033:0x7f6d543fb759
[   25.698624][  T228] Code: 08 5b 89 e8 5d c3 66 2e 0f 1f 84 00 00 00 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 a6 0e 00 f7 d8 64 89 01 48
[   25.699946][  T228] RSP: 002b:00007ffc3df78468 EFLAGS: 00000287 ORIG_RAX: 0000000000000141
[   25.700526][  T228] RAX: ffffffffffffffda RBX: 00007ffc3df78628 RCX: 00007f6d543fb759
[   25.701071][  T228] RDX: 0000000000000090 RSI: 00007ffc3df78478 RDI: 000000000000000a
[   25.701636][  T228] RBP: 00007ffc3df78510 R08: 0000000000000000 R09: 0000000000300000
[   25.702191][  T228] R10: 0000000000000005 R11: 0000000000000287 R12: 0000000000000000
[   25.702736][  T228] R13: 00007ffc3df78638 R14: 000055a1584aca68 R15: 00007f6d5456a000
[   25.703282][  T228]  </TASK>
[   25.703490][  T228] ==================================================================
[   25.704050][  T228] Disabling lock debugging due to kernel taint

Update copy_from_bpfptr() and strncpy_from_bpfptr() so that:
 - for a kernel pointer, it uses the safe copy_from_kernel_nofault() and
   strncpy_from_kernel_nofault() functions.
 - for a userspace pointer, it performs copy_from_user() and
   strncpy_from_user().

Fixes: af2ac3e13e45 ("bpf: Prepare bpf syscall to be used from kernel and user space.")
Link: https://lore.kernel.org/bpf/20220727132905.45166-1-jinghao@linux.ibm.com/
Signed-off-by: Jinghao Jia <jinghao@linux.ibm.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220729201713.88688-1-jinghao@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpfptr.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 46e1757d06a3..79b2f78eec1a 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -49,7 +49,9 @@ static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val)
 static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src,
 					  size_t offset, size_t size)
 {
-	return copy_from_sockptr_offset(dst, (sockptr_t) src, offset, size);
+	if (!bpfptr_is_kernel(src))
+		return copy_from_user(dst, src.user + offset, size);
+	return copy_from_kernel_nofault(dst, src.kernel + offset, size);
 }
 
 static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size)
@@ -78,7 +80,9 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
 
 static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
 {
-	return strncpy_from_sockptr(dst, (sockptr_t) src, count);
+	if (bpfptr_is_kernel(src))
+		return strncpy_from_kernel_nofault(dst, src.kernel, count);
+	return strncpy_from_user(dst, src.user, count);
 }
 
 #endif /* _LINUX_BPFPTR_H */
-- 
cgit v1.2.3


From 3c59366c207e4c6c6569524af606baf017a55c61 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 1 Aug 2022 10:33:34 +1000
Subject: NFS: don't unhash dentry during unlink/rename

NFS unlink() (and rename over existing target) must determine if the
file is open, and must perform a "silly rename" instead of an unlink (or
before rename) if it is.  Otherwise the client might hold a file open
which has been removed on the server.

Consequently if it determines that the file isn't open, it must block
any subsequent opens until the unlink/rename has been completed on the
server.

This is currently achieved by unhashing the dentry.  This forces any
open attempt to the slow-path for lookup which will block on i_rwsem on
the directory until the unlink/rename completes.  A future patch will
change the VFS to only get a shared lock on i_rwsem for unlink, so this
will no longer work.

Instead we introduce an explicit interlock.  A special value is stored
in dentry->d_fsdata while the unlink/rename is running and
->d_revalidate blocks while that value is present.  When ->d_revalidate
unblocks, the dentry will be invalid.  This closes the race
without requiring exclusion on i_rwsem.

d_fsdata is already used in two different ways.
1/ an IS_ROOT directory dentry might have a "devname" stored in
   d_fsdata.  Such a dentry doesn't have a name and so cannot be the
   target of unlink or rename.  For safety we check if an old devname
   is still stored, and remove it if it is.
2/ a dentry with DCACHE_NFSFS_RENAMED set will have a 'struct
   nfs_unlinkdata' stored in d_fsdata.  While this is set maydelete()
   will fail, so an unlink or rename will never proceed on such
   a dentry.

Neither of these can be in effect when a dentry is the target of unlink
or rename.  So we can expect d_fsdata to be NULL, and store a special
value ((void*)1) which is given the name NFS_FSDATA_BLOCKED to indicate
that any lookup will be blocked.

The d_count() is incremented under d_lock() when a lookup finds the
dentry, so we check d_count() is low, and set NFS_FSDATA_BLOCKED under
the same lock to avoid any races.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 72 +++++++++++++++++++++++++++++++++++++-------------
 include/linux/nfs_fs.h |  9 +++++++
 2 files changed, 63 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7b2230297f6b..dbab3caa15ed 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1782,6 +1782,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
 	int ret;
 
 	if (flags & LOOKUP_RCU) {
+		if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
+			return -ECHILD;
 		parent = READ_ONCE(dentry->d_parent);
 		dir = d_inode_rcu(parent);
 		if (!dir)
@@ -1790,6 +1792,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
 		if (parent != READ_ONCE(dentry->d_parent))
 			return -ECHILD;
 	} else {
+		/* Wait for unlink to complete */
+		wait_var_event(&dentry->d_fsdata,
+			       dentry->d_fsdata != NFS_FSDATA_BLOCKED);
 		parent = dget_parent(dentry);
 		ret = reval(d_inode(parent), dentry, flags);
 		dput(parent);
@@ -2458,7 +2463,6 @@ out:
 int nfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error;
-	int need_rehash = 0;
 
 	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
 		dir->i_ino, dentry);
@@ -2473,15 +2477,25 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 		error = nfs_sillyrename(dir, dentry);
 		goto out;
 	}
-	if (!d_unhashed(dentry)) {
-		__d_drop(dentry);
-		need_rehash = 1;
-	}
+	/* We must prevent any concurrent open until the unlink
+	 * completes.  ->d_revalidate will wait for ->d_fsdata
+	 * to clear.  We set it here to ensure no lookup succeeds until
+	 * the unlink is complete on the server.
+	 */
+	error = -ETXTBSY;
+	if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+	    WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+		goto out;
+	if (dentry->d_fsdata)
+		/* old devname */
+		kfree(dentry->d_fsdata);
+	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+
 	spin_unlock(&dentry->d_lock);
 	error = nfs_safe_remove(dentry);
 	nfs_dentry_remove_handle_error(dir, dentry, error);
-	if (need_rehash)
-		d_rehash(dentry);
+	dentry->d_fsdata = NULL;
+	wake_up_var(&dentry->d_fsdata);
 out:
 	trace_nfs_unlink_exit(dir, dentry, error);
 	return error;
@@ -2588,6 +2602,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 }
 EXPORT_SYMBOL_GPL(nfs_link);
 
+static void
+nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+	struct dentry *new_dentry = data->new_dentry;
+
+	new_dentry->d_fsdata = NULL;
+	wake_up_var(&new_dentry->d_fsdata);
+}
+
 /*
  * RENAME
  * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2618,8 +2641,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 {
 	struct inode *old_inode = d_inode(old_dentry);
 	struct inode *new_inode = d_inode(new_dentry);
-	struct dentry *dentry = NULL, *rehash = NULL;
+	struct dentry *dentry = NULL;
 	struct rpc_task *task;
+	bool must_unblock = false;
 	int error = -EBUSY;
 
 	if (flags)
@@ -2637,18 +2661,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	 * the new target.
 	 */
 	if (new_inode && !S_ISDIR(new_inode->i_mode)) {
-		/*
-		 * To prevent any new references to the target during the
-		 * rename, we unhash the dentry in advance.
+		/* We must prevent any concurrent open until the unlink
+		 * completes.  ->d_revalidate will wait for ->d_fsdata
+		 * to clear.  We set it here to ensure no lookup succeeds until
+		 * the unlink is complete on the server.
 		 */
-		if (!d_unhashed(new_dentry)) {
-			d_drop(new_dentry);
-			rehash = new_dentry;
+		error = -ETXTBSY;
+		if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+		    WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+			goto out;
+		if (new_dentry->d_fsdata) {
+			/* old devname */
+			kfree(new_dentry->d_fsdata);
+			new_dentry->d_fsdata = NULL;
 		}
 
+		spin_lock(&new_dentry->d_lock);
 		if (d_count(new_dentry) > 2) {
 			int err;
 
+			spin_unlock(&new_dentry->d_lock);
+
 			/* copy the target dentry's name */
 			dentry = d_alloc(new_dentry->d_parent,
 					 &new_dentry->d_name);
@@ -2661,14 +2694,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 				goto out;
 
 			new_dentry = dentry;
-			rehash = NULL;
 			new_inode = NULL;
+		} else {
+			new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+			must_unblock = true;
+			spin_unlock(&new_dentry->d_lock);
 		}
+
 	}
 
 	if (S_ISREG(old_inode->i_mode))
 		nfs_sync_inode(old_inode);
-	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
+				must_unblock ? nfs_unblock_rename : NULL);
 	if (IS_ERR(task)) {
 		error = PTR_ERR(task);
 		goto out;
@@ -2692,8 +2730,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		spin_unlock(&old_inode->i_lock);
 	}
 out:
-	if (rehash)
-		d_rehash(rehash);
 	trace_nfs_rename_exit(old_dir, old_dentry,
 			new_dir, new_dentry, error);
 	if (!error) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a17c337dbdf1..b32ed68e7dc4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -617,6 +617,15 @@ nfs_fileid_to_ino_t(u64 fileid)
 
 #define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
 
+/* We need to block new opens while a file is being unlinked.
+ * If it is opened *before* we decide to unlink, we will silly-rename
+ * instead. If it is opened *after*, then we need to create or will fail.
+ * If we allow the two to race, we could end up with a file that is open
+ * but deleted on the server resulting in ESTALE.
+ * So use ->d_fsdata to record when the unlink is happening
+ * and block dentry revalidation while it is set.
+ */
+#define NFS_FSDATA_BLOCKED ((void*)1)
 
 # undef ifdebug
 # ifdef NFS_DEBUG
-- 
cgit v1.2.3


From 2da1c30929a28c3c6b01d9c16c4216037be95597 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:28 +0800
Subject: mm: hugetlb_vmemmap: delete hugetlb_optimize_vmemmap_enabled()

Patch series "Simplify hugetlb vmemmap and improve its readability", v2.

This series aims to simplify hugetlb vmemmap and improve its readability.


This patch (of 8):

The name hugetlb_optimize_vmemmap_enabled() a bit confusing as it tests
two conditions (enabled and pages in use).  Instead of coming up to an
appropriate name, we could just delete it.  There is already a discussion
about deleting it in thread [1].

There is only one user of hugetlb_optimize_vmemmap_enabled() outside of
hugetlb_vmemmap, that is flush_dcache_page() in arch/arm64/mm/flush.c.
However, it does not need to call hugetlb_optimize_vmemmap_enabled() in
flush_dcache_page() since HugeTLB pages are always fully mapped and only
head page will be set PG_dcache_clean meaning only head page's flag may
need to be cleared (see commit cf5a501d985b).  So it is easy to remove
hugetlb_optimize_vmemmap_enabled().

Link: https://lore.kernel.org/all/c77c61c8-8a5a-87e8-db89-d04d8aaab4cc@oracle.com/ [1]
Link: https://lkml.kernel.org/r/20220628092235.91270-2-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/mm/flush.c      | 13 +++----------
 include/linux/page-flags.h | 14 ++------------
 2 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index fc4f710e9820..5f9379b3c8c8 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -76,17 +76,10 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
 void flush_dcache_page(struct page *page)
 {
 	/*
-	 * Only the head page's flags of HugeTLB can be cleared since the tail
-	 * vmemmap pages associated with each HugeTLB page are mapped with
-	 * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more
-	 * details can refer to vmemmap_remap_pte()).  Although
-	 * __sync_icache_dcache() only set PG_dcache_clean flag on the head
-	 * page struct, there is more than one page struct with PG_dcache_clean
-	 * associated with the HugeTLB page since the head vmemmap page frame
-	 * is reused (more details can refer to the comments above
-	 * page_fixed_fake_head()).
+	 * HugeTLB pages are always fully mapped and only head page will be
+	 * set PG_dcache_clean (see comments in __sync_icache_dcache()).
 	 */
-	if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page))
+	if (PageHuge(page))
 		page = compound_head(page);
 
 	if (test_bit(PG_dcache_clean, &page->flags))
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ea19528564d1..2455405ab82b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,12 +208,6 @@ enum pageflags {
 DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
 			 hugetlb_optimize_vmemmap_key);
 
-static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
-{
-	return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-				   &hugetlb_optimize_vmemmap_key);
-}
-
 /*
  * If the feature of optimizing vmemmap pages associated with each HugeTLB
  * page is enabled, the head vmemmap page frame is reused and all of the tail
@@ -232,7 +226,8 @@ static __always_inline bool hugetlb_optimize_vmemmap_enabled(void)
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!hugetlb_optimize_vmemmap_enabled())
+	if (!static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
+				 &hugetlb_optimize_vmemmap_key))
 		return page;
 
 	/*
@@ -260,11 +255,6 @@ static inline const struct page *page_fixed_fake_head(const struct page *page)
 {
 	return page;
 }
-
-static inline bool hugetlb_optimize_vmemmap_enabled(void)
-{
-	return false;
-}
 #endif
 
 static __always_inline int page_is_fake_head(struct page *page)
-- 
cgit v1.2.3


From cf5472e561133888df81d2e48f7da9ebd3299459 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:29 +0800
Subject: mm: hugetlb_vmemmap: optimize vmemmap_optimize_mode handling

We hold an another reference to hugetlb_optimize_vmemmap_key when making
vmemmap_optimize_mode on, because we use static_key to tell memory_hotplug
that memory_hotplug.memmap_on_memory should be overridden.  However, this
rule has gone when we have introduced PageVmemmapSelfHosted.  Therefore,
we could simplify vmemmap_optimize_mode handling by not holding an another
reference to hugetlb_optimize_vmemmap_key.  This also means that we not
incur the extra page_fixed_fake_head checks if there are no vmemmap
optinmized hugetlb pages after this change.

Link: https://lkml.kernel.org/r/20220628092235.91270-3-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  6 ++---
 mm/hugetlb_vmemmap.c       | 65 +++++-----------------------------------------
 2 files changed, 9 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2455405ab82b..b44cc24d7496 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -205,8 +205,7 @@ enum pageflags {
 #ifndef __GENERATING_BOUNDS_H
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-			 hugetlb_optimize_vmemmap_key);
+DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 
 /*
  * If the feature of optimizing vmemmap pages associated with each HugeTLB
@@ -226,8 +225,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-	if (!static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-				 &hugetlb_optimize_vmemmap_key))
+	if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
 		return page;
 
 	/*
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1362feb3c6c9..e5b83a25c2fa 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -23,42 +23,15 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
-enum vmemmap_optimize_mode {
-	VMEMMAP_OPTIMIZE_OFF,
-	VMEMMAP_OPTIMIZE_ON,
-};
-
-DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
-			hugetlb_optimize_vmemmap_key);
+DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
-static enum vmemmap_optimize_mode vmemmap_optimize_mode =
+static bool vmemmap_optimize_enabled =
 	IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
 
-static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
-{
-	if (vmemmap_optimize_mode == to)
-		return;
-
-	if (to == VMEMMAP_OPTIMIZE_OFF)
-		static_branch_dec(&hugetlb_optimize_vmemmap_key);
-	else
-		static_branch_inc(&hugetlb_optimize_vmemmap_key);
-	WRITE_ONCE(vmemmap_optimize_mode, to);
-}
-
 static int __init hugetlb_vmemmap_early_param(char *buf)
 {
-	bool enable;
-	enum vmemmap_optimize_mode mode;
-
-	if (kstrtobool(buf, &enable))
-		return -EINVAL;
-
-	mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
-	vmemmap_optimize_mode_switch(mode);
-
-	return 0;
+	return kstrtobool(buf, &vmemmap_optimize_enabled);
 }
 early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
 
@@ -100,7 +73,7 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 static unsigned int vmemmap_optimizable_pages(struct hstate *h,
 					      struct page *head)
 {
-	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+	if (!READ_ONCE(vmemmap_optimize_enabled))
 		return 0;
 
 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
@@ -191,7 +164,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 
 	if (!is_power_of_2(sizeof(struct page))) {
 		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
-		static_branch_disable(&hugetlb_optimize_vmemmap_key);
 		return;
 	}
 
@@ -212,36 +184,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 }
 
 #ifdef CONFIG_PROC_SYSCTL
-static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
-					    void *buffer, size_t *length,
-					    loff_t *ppos)
-{
-	int ret;
-	enum vmemmap_optimize_mode mode;
-	static DEFINE_MUTEX(sysctl_mutex);
-
-	if (write && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&sysctl_mutex);
-	mode = vmemmap_optimize_mode;
-	table->data = &mode;
-	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (write && !ret)
-		vmemmap_optimize_mode_switch(mode);
-	mutex_unlock(&sysctl_mutex);
-
-	return ret;
-}
-
 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 	{
 		.procname	= "hugetlb_optimize_vmemmap",
-		.maxlen		= sizeof(enum vmemmap_optimize_mode),
+		.data		= &vmemmap_optimize_enabled,
+		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= hugetlb_optimize_vmemmap_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
+		.proc_handler	= proc_dobool,
 	},
 	{ }
 };
-- 
cgit v1.2.3


From dff033818a06e7d0bf79271e34bda11c2d9d98d0 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:30 +0800
Subject: mm: hugetlb_vmemmap: introduce the name HVO

It it inconvenient to mention the feature of optimizing vmemmap pages
associated with HugeTLB pages when communicating with others since there
is no specific or abbreviated name for it when it is first introduced.
Let us give it a name HVO (HugeTLB Vmemmap Optimization) from now.

This commit also updates the document about "hugetlb_free_vmemmap" by the
way discussed in thread [1].

Link: https://lore.kernel.org/all/21aae898-d54d-cc4b-a11f-1bb7fddcfffa@redhat.com/ [1]
Link: https://lkml.kernel.org/r/20220628092235.91270-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 ++++---
 Documentation/admin-guide/mm/hugetlbpage.rst    |  4 ++--
 Documentation/admin-guide/mm/memory-hotplug.rst |  4 ++--
 Documentation/admin-guide/sysctl/vm.rst         |  3 +--
 Documentation/mm/vmemmap_dedup.rst              |  2 ++
 fs/Kconfig                                      | 12 +++++-------
 include/linux/page-flags.h                      |  3 +--
 mm/hugetlb_vmemmap.c                            |  8 ++++----
 mm/hugetlb_vmemmap.h                            |  4 ++--
 9 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bab2b0bf5988..8c06271835b0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1731,12 +1731,13 @@
 	hugetlb_free_vmemmap=
 			[KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 			enabled.
+			Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
-			Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }
+			Format: { on | off (default) }
 
-			[oO][Nn]/Y/y/1: enable the feature
-			[oO][Ff]/N/n/0: disable the feature
+			on: enable HVO
+			off: disable HVO
 
 			Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
 			the default is on.
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index a90330d0a837..8e2727dc18d4 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -164,8 +164,8 @@ default_hugepagesz
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
 hugetlb_free_vmemmap
-	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing
-	unused vmemmap pages associated with each HugeTLB page.
+	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB
+	Vmemmap Optimization (HVO).
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 0f56ecd8ac05..a3c9e8ad8fa0 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -653,8 +653,8 @@ block might fail:
 - Concurrent activity that operates on the same physical memory area, such as
   allocating gigantic pages, can result in temporary offlining failures.
 
-- Out of memory when dissolving huge pages, especially when freeing unused
-  vmemmap pages associated with each hugetlb page is enabled.
+- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap
+  Optimization (HVO) is enabled.
 
   Offlining code may be able to migrate huge page contents, but may not be able
   to dissolve the source huge page because it fails allocating (unmovable) pages
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index f74f722ad702..9b833e439f09 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -569,8 +569,7 @@ This knob is not available when the size of 'struct page' (a structure defined
 in include/linux/mm_types.h) is not power of two (an unusual system config could
 result in this).
 
-Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
-associated with each HugeTLB page.
+Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO).
 
 Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
 buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index c9c495f62d12..7d7a161aa364 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -7,6 +7,8 @@ A vmemmap diet for HugeTLB and Device DAX
 HugeTLB
 =======
 
+This section is to explain how HugeTLB Vmemmap Optimization (HVO) works.
+
 The struct page structures (page structs) are used to describe a physical
 page frame. By default, there is a one-to-one mapping from a page frame to
 it's corresponding page struct.
diff --git a/fs/Kconfig b/fs/Kconfig
index 5976eb33535f..a547307c1ae8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -247,8 +247,7 @@ config HUGETLB_PAGE
 
 #
 # Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of minimizing overhead of struct page associated with
-# each HugeTLB page.
+# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
 #
 config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	bool
@@ -259,14 +258,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
-	bool "Default optimizing vmemmap pages of HugeTLB to on"
+	bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
 	default n
 	depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	help
-	  When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
-	  pages associated with each HugeTLB page is default off. Say Y here
-	  to enable optimizing vmemmap pages of HugeTLB by default. It can then
-	  be disabled on the command line via hugetlb_free_vmemmap=off.
+	  The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to
+	  enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+	  (boot command line) or hugetlb_optimize_vmemmap (sysctl).
 
 config MEMFD_CREATE
 	def_bool TMPFS || HUGETLBFS
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index b44cc24d7496..78ed46ae6ee5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,8 +208,7 @@ enum pageflags {
 DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 
 /*
- * If the feature of optimizing vmemmap pages associated with each HugeTLB
- * page is enabled, the head vmemmap page frame is reused and all of the tail
+ * If HVO is enabled, the head vmemmap page frame is reused and all of the tail
  * vmemmap addresses map to the head vmemmap page frame (furture details can
  * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
  * words, there are more than one page struct with PG_head associated with each
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index e5b83a25c2fa..bcafd9d7639c 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
  *
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
  *
  *     Author: Muchun Song <songmuchun@bytedance.com>
  *
@@ -156,8 +156,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
 
 	/*
 	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
-	 * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP,
-	 * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
+	 * page structs that can be used when HVO is enabled, add a BUILD_BUG_ON
+	 * to catch invalid usage of the tail page structs.
 	 */
 	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
 		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 109b0a53b6fe..ba66fadad9fc 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Optimize vmemmap pages associated with HugeTLB
+ * HugeTLB Vmemmap Optimization (HVO)
  *
- * Copyright (c) 2020, Bytedance. All rights reserved.
+ * Copyright (c) 2020, ByteDance. All rights reserved.
  *
  *     Author: Muchun Song <songmuchun@bytedance.com>
  */
-- 
cgit v1.2.3


From 998a2997885f73e5cc732ac6d661dfa6e0f50654 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:31 +0800
Subject: mm: hugetlb_vmemmap: move vmemmap code related to HugeTLB to
 hugetlb_vmemmap.c

When I first introduced vmemmap manipulation functions related to HugeTLB,
I thought those functions may be reused by other modules (e.g.  using
similar approach to optimize vmemmap pages, unfortunately, the DAX used
the same approach but does not use those functions).  After two years, we
didn't see any other users.  So move those functions to hugetlb_vmemmap.c.
Code movement without any functional change.

Link: https://lkml.kernel.org/r/20220628092235.91270-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |   7 -
 mm/hugetlb_vmemmap.c | 399 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/sparse-vmemmap.c  | 399 ---------------------------------------------------
 3 files changed, 398 insertions(+), 407 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 18e01474cf6b..e6e201a4ce05 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3142,13 +3142,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 }
 #endif
 
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int vmemmap_remap_free(unsigned long start, unsigned long end,
-		       unsigned long reuse);
-int vmemmap_remap_alloc(unsigned long start, unsigned long end,
-			unsigned long reuse, gfp_t gfp_mask);
-#endif
-
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index bcafd9d7639c..f68e216600b9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -10,9 +10,31 @@
  */
 #define pr_fmt(fmt)	"HugeTLB: " fmt
 
-#include <linux/memory.h>
+#include <linux/pgtable.h>
+#include <linux/bootmem_info.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 #include "hugetlb_vmemmap.h"
 
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte:		called for each lowest-level entry (PTE).
+ * @nr_walked:		the number of walked pte.
+ * @reuse_page:		the page which is reused for the tail vmemmap pages.
+ * @reuse_addr:		the virtual address of the @reuse_page page.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or is mapped from.
+ */
+struct vmemmap_remap_walk {
+	void			(*remap_pte)(pte_t *pte, unsigned long addr,
+					     struct vmemmap_remap_walk *walk);
+	unsigned long		nr_walked;
+	struct page		*reuse_page;
+	unsigned long		reuse_addr;
+	struct list_head	*vmemmap_pages;
+};
+
 /*
  * There are a lot of struct page structures associated with each HugeTLB page.
  * For tail pages, the value of compound_head is the same. So we can reuse first
@@ -23,6 +45,381 @@
 #define RESERVE_VMEMMAP_NR		1U
 #define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	pmd_t __pmd;
+	int i;
+	unsigned long addr = start;
+	struct page *page = pmd_page(*pmd);
+	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+	if (!pgtable)
+		return -ENOMEM;
+
+	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
+		pte_t entry, *pte;
+		pgprot_t pgprot = PAGE_KERNEL;
+
+		entry = mk_pte(page + i, pgprot);
+		pte = pte_offset_kernel(&__pmd, addr);
+		set_pte_at(&init_mm, addr, pte, entry);
+	}
+
+	spin_lock(&init_mm.page_table_lock);
+	if (likely(pmd_leaf(*pmd))) {
+		/*
+		 * Higher order allocations from buddy allocator must be able to
+		 * be treated as indepdenent small pages (as they can be freed
+		 * individually).
+		 */
+		if (!PageReserved(page))
+			split_page(page, get_order(PMD_SIZE));
+
+		/* Make pte visible before pmd. See comment in pmd_install(). */
+		smp_wmb();
+		pmd_populate_kernel(&init_mm, pmd, pgtable);
+		flush_tlb_kernel_range(start, start + PMD_SIZE);
+	} else {
+		pte_free_kernel(&init_mm, pgtable);
+	}
+	spin_unlock(&init_mm.page_table_lock);
+
+	return 0;
+}
+
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+	int leaf;
+
+	spin_lock(&init_mm.page_table_lock);
+	leaf = pmd_leaf(*pmd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	if (!leaf)
+		return 0;
+
+	return __split_vmemmap_huge_pmd(pmd, start);
+}
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+			      unsigned long end,
+			      struct vmemmap_remap_walk *walk)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	/*
+	 * The reuse_page is found 'first' in table walk before we start
+	 * remapping (which is calling @walk->remap_pte).
+	 */
+	if (!walk->reuse_page) {
+		walk->reuse_page = pte_page(*pte);
+		/*
+		 * Because the reuse address is part of the range that we are
+		 * walking, skip the reuse address range.
+		 */
+		addr += PAGE_SIZE;
+		pte++;
+		walk->nr_walked++;
+	}
+
+	for (; addr != end; addr += PAGE_SIZE, pte++) {
+		walk->remap_pte(pte, addr, walk);
+		walk->nr_walked++;
+	}
+}
+
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		int ret;
+
+		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+		if (ret)
+			return ret;
+
+		next = pmd_addr_end(addr, end);
+		vmemmap_pte_range(pmd, addr, next, walk);
+	} while (pmd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_offset(p4d, addr);
+	do {
+		int ret;
+
+		next = pud_addr_end(addr, end);
+		ret = vmemmap_pmd_range(pud, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pud++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+			     unsigned long end,
+			     struct vmemmap_remap_walk *walk)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		int ret;
+
+		next = p4d_addr_end(addr, end);
+		ret = vmemmap_pud_range(p4d, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (p4d++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+			       struct vmemmap_remap_walk *walk)
+{
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgd;
+
+	VM_BUG_ON(!PAGE_ALIGNED(start));
+	VM_BUG_ON(!PAGE_ALIGNED(end));
+
+	pgd = pgd_offset_k(addr);
+	do {
+		int ret;
+
+		next = pgd_addr_end(addr, end);
+		ret = vmemmap_p4d_range(pgd, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pgd++, addr = next, addr != end);
+
+	/*
+	 * We only change the mapping of the vmemmap virtual address range
+	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
+	 * belongs to the range.
+	 */
+	flush_tlb_kernel_range(start + PAGE_SIZE, end);
+
+	return 0;
+}
+
+/*
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or buddy allocator. If the PG_reserved flag is set, it means
+ * that it allocated from the memblock allocator, just free it via the
+ * free_bootmem_page(). Otherwise, use __free_page().
+ */
+static inline void free_vmemmap_page(struct page *page)
+{
+	if (PageReserved(page))
+		free_bootmem_page(page);
+	else
+		__free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next, list, lru) {
+		list_del(&page->lru);
+		free_vmemmap_page(page);
+	}
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+			      struct vmemmap_remap_walk *walk)
+{
+	/*
+	 * Remap the tail pages as read-only to catch illegal write operation
+	 * to the tail pages.
+	 */
+	pgprot_t pgprot = PAGE_KERNEL_RO;
+	pte_t entry = mk_pte(walk->reuse_page, pgprot);
+	struct page *page = pte_page(*pte);
+
+	list_add_tail(&page->lru, walk->vmemmap_pages);
+	set_pte_at(&init_mm, addr, pte, entry);
+}
+
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot copy to the tail struct page structs. The invalid value will be
+ * checked in the free_tail_pages_check(). In order to avoid the message
+ * of "corrupted mapping in tail page". We need to reset at least 3 (one
+ * head struct page struct and two tail struct page structs) struct page
+ * structs.
+ */
+#define NR_RESET_STRUCT_PAGE		3
+
+static inline void reset_struct_pages(struct page *start)
+{
+	int i;
+	struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+		memcpy(start + i, from, sizeof(*from));
+}
+
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+	reset_struct_pages(to);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ *			to the page which @reuse is mapped to, then free vmemmap
+ *			which the range are mapped to.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_free(unsigned long start, unsigned long end,
+			      unsigned long reuse)
+{
+	int ret;
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_remap_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/*
+	 * In order to make remapping routine most efficient for the huge pages,
+	 * the routine of vmemmap page table walking has the following rules
+	 * (see more details from the vmemmap_pte_range()):
+	 *
+	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+	 *   should be continuous.
+	 * - The @reuse address is part of the range [@reuse, @end) that we are
+	 *   walking which is passed to vmemmap_remap_range().
+	 * - The @reuse address is the first in the complete range.
+	 *
+	 * So we need to make sure that @start and @reuse meet the above rules.
+	 */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	mmap_read_lock(&init_mm);
+	ret = vmemmap_remap_range(reuse, end, &walk);
+	if (ret && walk.nr_walked) {
+		end = reuse + walk.nr_walked * PAGE_SIZE;
+		/*
+		 * vmemmap_pages contains pages from the previous
+		 * vmemmap_remap_range call which failed.  These
+		 * are pages which were removed from the vmemmap.
+		 * They will be restored in the following call.
+		 */
+		walk = (struct vmemmap_remap_walk) {
+			.remap_pte	= vmemmap_restore_pte,
+			.reuse_addr	= reuse,
+			.vmemmap_pages	= &vmemmap_pages,
+		};
+
+		vmemmap_remap_range(reuse, end, &walk);
+	}
+	mmap_read_unlock(&init_mm);
+
+	free_vmemmap_page_list(&vmemmap_pages);
+
+	return ret;
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+				   gfp_t gfp_mask, struct list_head *list)
+{
+	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+	int nid = page_to_nid((struct page *)start);
+	struct page *page, *next;
+
+	while (nr_pages--) {
+		page = alloc_pages_node(nid, gfp_mask, 0);
+		if (!page)
+			goto out;
+		list_add_tail(&page->lru, list);
+	}
+
+	return 0;
+out:
+	list_for_each_entry_safe(page, next, list, lru)
+		__free_pages(page, 0);
+	return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ *			 to the page which is from the @vmemmap_pages
+ *			 respectively.
+ * @start:	start address of the vmemmap virtual address range that we want
+ *		to remap.
+ * @end:	end address of the vmemmap virtual address range that we want to
+ *		remap.
+ * @reuse:	reuse address.
+ * @gfp_mask:	GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			       unsigned long reuse, gfp_t gfp_mask)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_restore_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+		return -ENOMEM;
+
+	mmap_read_lock(&init_mm);
+	vmemmap_remap_range(reuse, end, &walk);
+	mmap_read_unlock(&init_mm);
+
+	return 0;
+}
+
 DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 5f0ed4717ed0..46ae542118c0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,408 +27,9 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
-#include <linux/pgtable.h>
-#include <linux/bootmem_info.h>
 
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-/**
- * struct vmemmap_remap_walk - walk vmemmap page table
- *
- * @remap_pte:		called for each lowest-level entry (PTE).
- * @nr_walked:		the number of walked pte.
- * @reuse_page:		the page which is reused for the tail vmemmap pages.
- * @reuse_addr:		the virtual address of the @reuse_page page.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
- *			or is mapped from.
- */
-struct vmemmap_remap_walk {
-	void (*remap_pte)(pte_t *pte, unsigned long addr,
-			  struct vmemmap_remap_walk *walk);
-	unsigned long nr_walked;
-	struct page *reuse_page;
-	unsigned long reuse_addr;
-	struct list_head *vmemmap_pages;
-};
-
-static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
-	pmd_t __pmd;
-	int i;
-	unsigned long addr = start;
-	struct page *page = pmd_page(*pmd);
-	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
-
-	if (!pgtable)
-		return -ENOMEM;
-
-	pmd_populate_kernel(&init_mm, &__pmd, pgtable);
-
-	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
-		pte_t entry, *pte;
-		pgprot_t pgprot = PAGE_KERNEL;
-
-		entry = mk_pte(page + i, pgprot);
-		pte = pte_offset_kernel(&__pmd, addr);
-		set_pte_at(&init_mm, addr, pte, entry);
-	}
-
-	spin_lock(&init_mm.page_table_lock);
-	if (likely(pmd_leaf(*pmd))) {
-		/*
-		 * Higher order allocations from buddy allocator must be able to
-		 * be treated as indepdenent small pages (as they can be freed
-		 * individually).
-		 */
-		if (!PageReserved(page))
-			split_page(page, get_order(PMD_SIZE));
-
-		/* Make pte visible before pmd. See comment in pmd_install(). */
-		smp_wmb();
-		pmd_populate_kernel(&init_mm, pmd, pgtable);
-		flush_tlb_kernel_range(start, start + PMD_SIZE);
-	} else {
-		pte_free_kernel(&init_mm, pgtable);
-	}
-	spin_unlock(&init_mm.page_table_lock);
-
-	return 0;
-}
-
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
-{
-	int leaf;
-
-	spin_lock(&init_mm.page_table_lock);
-	leaf = pmd_leaf(*pmd);
-	spin_unlock(&init_mm.page_table_lock);
-
-	if (!leaf)
-		return 0;
-
-	return __split_vmemmap_huge_pmd(pmd, start);
-}
-
-static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
-			      unsigned long end,
-			      struct vmemmap_remap_walk *walk)
-{
-	pte_t *pte = pte_offset_kernel(pmd, addr);
-
-	/*
-	 * The reuse_page is found 'first' in table walk before we start
-	 * remapping (which is calling @walk->remap_pte).
-	 */
-	if (!walk->reuse_page) {
-		walk->reuse_page = pte_page(*pte);
-		/*
-		 * Because the reuse address is part of the range that we are
-		 * walking, skip the reuse address range.
-		 */
-		addr += PAGE_SIZE;
-		pte++;
-		walk->nr_walked++;
-	}
-
-	for (; addr != end; addr += PAGE_SIZE, pte++) {
-		walk->remap_pte(pte, addr, walk);
-		walk->nr_walked++;
-	}
-}
-
-static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
-			     unsigned long end,
-			     struct vmemmap_remap_walk *walk)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		int ret;
-
-		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
-		if (ret)
-			return ret;
-
-		next = pmd_addr_end(addr, end);
-		vmemmap_pte_range(pmd, addr, next, walk);
-	} while (pmd++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
-			     unsigned long end,
-			     struct vmemmap_remap_walk *walk)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(p4d, addr);
-	do {
-		int ret;
-
-		next = pud_addr_end(addr, end);
-		ret = vmemmap_pmd_range(pud, addr, next, walk);
-		if (ret)
-			return ret;
-	} while (pud++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
-			     unsigned long end,
-			     struct vmemmap_remap_walk *walk)
-{
-	p4d_t *p4d;
-	unsigned long next;
-
-	p4d = p4d_offset(pgd, addr);
-	do {
-		int ret;
-
-		next = p4d_addr_end(addr, end);
-		ret = vmemmap_pud_range(p4d, addr, next, walk);
-		if (ret)
-			return ret;
-	} while (p4d++, addr = next, addr != end);
-
-	return 0;
-}
-
-static int vmemmap_remap_range(unsigned long start, unsigned long end,
-			       struct vmemmap_remap_walk *walk)
-{
-	unsigned long addr = start;
-	unsigned long next;
-	pgd_t *pgd;
-
-	VM_BUG_ON(!PAGE_ALIGNED(start));
-	VM_BUG_ON(!PAGE_ALIGNED(end));
-
-	pgd = pgd_offset_k(addr);
-	do {
-		int ret;
-
-		next = pgd_addr_end(addr, end);
-		ret = vmemmap_p4d_range(pgd, addr, next, walk);
-		if (ret)
-			return ret;
-	} while (pgd++, addr = next, addr != end);
-
-	/*
-	 * We only change the mapping of the vmemmap virtual address range
-	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
-	 * belongs to the range.
-	 */
-	flush_tlb_kernel_range(start + PAGE_SIZE, end);
-
-	return 0;
-}
-
-/*
- * Free a vmemmap page. A vmemmap page can be allocated from the memblock
- * allocator or buddy allocator. If the PG_reserved flag is set, it means
- * that it allocated from the memblock allocator, just free it via the
- * free_bootmem_page(). Otherwise, use __free_page().
- */
-static inline void free_vmemmap_page(struct page *page)
-{
-	if (PageReserved(page))
-		free_bootmem_page(page);
-	else
-		__free_page(page);
-}
-
-/* Free a list of the vmemmap pages */
-static void free_vmemmap_page_list(struct list_head *list)
-{
-	struct page *page, *next;
-
-	list_for_each_entry_safe(page, next, list, lru) {
-		list_del(&page->lru);
-		free_vmemmap_page(page);
-	}
-}
-
-static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
-			      struct vmemmap_remap_walk *walk)
-{
-	/*
-	 * Remap the tail pages as read-only to catch illegal write operation
-	 * to the tail pages.
-	 */
-	pgprot_t pgprot = PAGE_KERNEL_RO;
-	pte_t entry = mk_pte(walk->reuse_page, pgprot);
-	struct page *page = pte_page(*pte);
-
-	list_add_tail(&page->lru, walk->vmemmap_pages);
-	set_pte_at(&init_mm, addr, pte, entry);
-}
-
-/*
- * How many struct page structs need to be reset. When we reuse the head
- * struct page, the special metadata (e.g. page->flags or page->mapping)
- * cannot copy to the tail struct page structs. The invalid value will be
- * checked in the free_tail_pages_check(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
- * structs.
- */
-#define NR_RESET_STRUCT_PAGE		3
-
-static inline void reset_struct_pages(struct page *start)
-{
-	int i;
-	struct page *from = start + NR_RESET_STRUCT_PAGE;
-
-	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
-		memcpy(start + i, from, sizeof(*from));
-}
-
-static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
-				struct vmemmap_remap_walk *walk)
-{
-	pgprot_t pgprot = PAGE_KERNEL;
-	struct page *page;
-	void *to;
-
-	BUG_ON(pte_page(*pte) != walk->reuse_page);
-
-	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
-	list_del(&page->lru);
-	to = page_to_virt(page);
-	copy_page(to, (void *)walk->reuse_addr);
-	reset_struct_pages(to);
-
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
-}
-
-/**
- * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
- *			to the page which @reuse is mapped to, then free vmemmap
- *			which the range are mapped to.
- * @start:	start address of the vmemmap virtual address range that we want
- *		to remap.
- * @end:	end address of the vmemmap virtual address range that we want to
- *		remap.
- * @reuse:	reuse address.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int vmemmap_remap_free(unsigned long start, unsigned long end,
-		       unsigned long reuse)
-{
-	int ret;
-	LIST_HEAD(vmemmap_pages);
-	struct vmemmap_remap_walk walk = {
-		.remap_pte	= vmemmap_remap_pte,
-		.reuse_addr	= reuse,
-		.vmemmap_pages	= &vmemmap_pages,
-	};
-
-	/*
-	 * In order to make remapping routine most efficient for the huge pages,
-	 * the routine of vmemmap page table walking has the following rules
-	 * (see more details from the vmemmap_pte_range()):
-	 *
-	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
-	 *   should be continuous.
-	 * - The @reuse address is part of the range [@reuse, @end) that we are
-	 *   walking which is passed to vmemmap_remap_range().
-	 * - The @reuse address is the first in the complete range.
-	 *
-	 * So we need to make sure that @start and @reuse meet the above rules.
-	 */
-	BUG_ON(start - reuse != PAGE_SIZE);
-
-	mmap_read_lock(&init_mm);
-	ret = vmemmap_remap_range(reuse, end, &walk);
-	if (ret && walk.nr_walked) {
-		end = reuse + walk.nr_walked * PAGE_SIZE;
-		/*
-		 * vmemmap_pages contains pages from the previous
-		 * vmemmap_remap_range call which failed.  These
-		 * are pages which were removed from the vmemmap.
-		 * They will be restored in the following call.
-		 */
-		walk = (struct vmemmap_remap_walk) {
-			.remap_pte	= vmemmap_restore_pte,
-			.reuse_addr	= reuse,
-			.vmemmap_pages	= &vmemmap_pages,
-		};
-
-		vmemmap_remap_range(reuse, end, &walk);
-	}
-	mmap_read_unlock(&init_mm);
-
-	free_vmemmap_page_list(&vmemmap_pages);
-
-	return ret;
-}
-
-static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
-				   gfp_t gfp_mask, struct list_head *list)
-{
-	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
-	int nid = page_to_nid((struct page *)start);
-	struct page *page, *next;
-
-	while (nr_pages--) {
-		page = alloc_pages_node(nid, gfp_mask, 0);
-		if (!page)
-			goto out;
-		list_add_tail(&page->lru, list);
-	}
-
-	return 0;
-out:
-	list_for_each_entry_safe(page, next, list, lru)
-		__free_pages(page, 0);
-	return -ENOMEM;
-}
-
-/**
- * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
- *			 to the page which is from the @vmemmap_pages
- *			 respectively.
- * @start:	start address of the vmemmap virtual address range that we want
- *		to remap.
- * @end:	end address of the vmemmap virtual address range that we want to
- *		remap.
- * @reuse:	reuse address.
- * @gfp_mask:	GFP flag for allocating vmemmap pages.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int vmemmap_remap_alloc(unsigned long start, unsigned long end,
-			unsigned long reuse, gfp_t gfp_mask)
-{
-	LIST_HEAD(vmemmap_pages);
-	struct vmemmap_remap_walk walk = {
-		.remap_pte	= vmemmap_restore_pte,
-		.reuse_addr	= reuse,
-		.vmemmap_pages	= &vmemmap_pages,
-	};
-
-	/* See the comment in the vmemmap_remap_free(). */
-	BUG_ON(start - reuse != PAGE_SIZE);
-
-	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
-		return -ENOMEM;
-
-	mmap_read_lock(&init_mm);
-	vmemmap_remap_range(reuse, end, &walk);
-	mmap_read_unlock(&init_mm);
-
-	return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 
 /*
  * Allocate a block of memory to be used to back the virtual memory map
-- 
cgit v1.2.3


From 6213834c10de954470b7195cf0cdbda858edf0ee Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:33 +0800
Subject: mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability

There is a discussion about the name of hugetlb_vmemmap_alloc/free in
thread [1].  The suggestion suggested by David is rename "alloc/free" to
"optimize/restore" to make functionalities clearer to users, "optimize"
means the function will optimize vmemmap pages, while "restore" means
restoring its vmemmap pages discared before.  This commit does this.

Another discussion is the confusion RESERVE_VMEMMAP_NR isn't used
explicitly for vmemmap_addr but implicitly for vmemmap_end in
hugetlb_vmemmap_alloc/free.  David suggested we can compute what
hugetlb_vmemmap_init() does now at runtime.  We do not need to worry for
the overhead of computing at runtime since the calculation is simple
enough and those functions are not in a hot path.  This commit has the
following improvements:

  1) The function suffixed name ("optimize/restore") is more expressive.
  2) The logic becomes less weird in hugetlb_vmemmap_optimize/restore().
  3) The hugetlb_vmemmap_init() does not need to be exported anymore.
  4) A ->optimize_vmemmap_pages field in struct hstate is killed.
  5) There is only one place where checks is_power_of_2(sizeof(struct
     page)) instead of two places.
  6) Add more comments for hugetlb_vmemmap_optimize/restore().
  7) For external users, hugetlb_optimize_vmemmap_pages() is used for
     detecting if the HugeTLB's vmemmap pages is optimizable originally.
     In this commit, it is killed and we introduce a new helper
     hugetlb_vmemmap_optimizable() to replace it.  The name is more
     expressive.

Link: https://lore.kernel.org/all/20220404074652.68024-2-songmuchun@bytedance.com/ [1]
Link: https://lkml.kernel.org/r/20220628092235.91270-7-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |   7 +--
 include/linux/sysctl.h  |   4 ++
 mm/hugetlb.c            |  15 ++---
 mm/hugetlb_vmemmap.c    | 143 ++++++++++++++++++++----------------------------
 mm/hugetlb_vmemmap.h    |  41 +++++++++-----
 5 files changed, 102 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4cdfce976644..6d0620edf0a6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -638,9 +638,6 @@ struct hstate {
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-	unsigned int optimize_vmemmap_pages;
-#endif
 #ifdef CONFIG_CGROUP_HUGETLB
 	/* cgroup control files */
 	struct cftype cgroup_files_dfl[8];
@@ -716,7 +713,7 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
 	return hstate_file(vma->vm_file);
 }
 
-static inline unsigned long huge_page_size(struct hstate *h)
+static inline unsigned long huge_page_size(const struct hstate *h)
 {
 	return (unsigned long)PAGE_SIZE << h->order;
 }
@@ -745,7 +742,7 @@ static inline bool hstate_is_gigantic(struct hstate *h)
 	return huge_page_order(h) >= MAX_ORDER;
 }
 
-static inline unsigned int pages_per_huge_page(struct hstate *h)
+static inline unsigned int pages_per_huge_page(const struct hstate *h)
 {
 	return 1 << h->order;
 }
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 17b42ce89d3e..780690dc08cd 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -268,6 +268,10 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table *
 	return NULL;
 }
 
+static inline void register_sysctl_init(const char *path, struct ctl_table *table)
+{
+}
+
 static inline struct ctl_table_header *register_sysctl_mount_point(const char *path)
 {
 	return NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f044962ad9df..598c37279fee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1535,7 +1535,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
-	if (hugetlb_vmemmap_alloc(h, page)) {
+	if (hugetlb_vmemmap_restore(h, page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
 		 * If we cannot allocate vmemmap pages, just refuse to free the
@@ -1612,7 +1612,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
 
 static inline void flush_free_hpage_work(struct hstate *h)
 {
-	if (hugetlb_optimize_vmemmap_pages(h))
+	if (hugetlb_vmemmap_optimizable(h))
 		flush_work(&free_hpage_work);
 }
 
@@ -1734,7 +1734,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
 
 static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
-	hugetlb_vmemmap_free(h, page);
+	hugetlb_vmemmap_optimize(h, page);
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	hugetlb_set_page_subpool(page, NULL);
@@ -2107,7 +2107,7 @@ retry:
 		 * Attempt to allocate vmemmmap here so that we can take
 		 * appropriate action on failure.
 		 */
-		rc = hugetlb_vmemmap_alloc(h, head);
+		rc = hugetlb_vmemmap_restore(h, head);
 		if (!rc) {
 			/*
 			 * Move PageHWPoison flag from head page to the raw
@@ -3182,8 +3182,10 @@ static void __init report_hugepages(void)
 		char buf[32];
 
 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
-		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+		pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
 			buf, h->free_huge_pages);
+		pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
+			hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
 	}
 }
 
@@ -3421,7 +3423,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 	remove_hugetlb_page_for_demote(h, page, false);
 	spin_unlock_irq(&hugetlb_lock);
 
-	rc = hugetlb_vmemmap_alloc(h, page);
+	rc = hugetlb_vmemmap_restore(h, page);
 	if (rc) {
 		/* Allocation of vmemmmap failed, we can not demote page */
 		spin_lock_irq(&hugetlb_lock);
@@ -4111,7 +4113,6 @@ void __init hugetlb_add_hstate(unsigned int order)
 	h->next_nid_to_free = first_memory_node;
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
-	hugetlb_vmemmap_init(h);
 
 	parsed_hstate = h;
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 6c7117c30e56..8da2b31bb59f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -35,16 +35,6 @@ struct vmemmap_remap_walk {
 	struct list_head	*vmemmap_pages;
 };
 
-/*
- * There are a lot of struct page structures associated with each HugeTLB page.
- * For tail pages, the value of compound_head is the same. So we can reuse first
- * page of head page structures. We map the virtual addresses of all the pages
- * of tail page structures to the head page struct, and then free these page
- * frames. Therefore, we need to reserve one pages as vmemmap areas.
- */
-#define RESERVE_VMEMMAP_NR		1U
-#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-
 static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
 {
 	pmd_t __pmd;
@@ -426,32 +416,37 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
 core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
 
-/*
- * Previously discarded vmemmap pages will be allocated and remapping
- * after this function returns zero.
+/**
+ * hugetlb_vmemmap_restore - restore previously optimized (by
+ *			     hugetlb_vmemmap_optimize()) vmemmap pages which
+ *			     will be reallocated and remapped.
+ * @h:		struct hstate.
+ * @head:	the head page whose vmemmap pages will be restored.
+ *
+ * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
+ * negative error code otherwise.
  */
-int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 {
 	int ret;
-	unsigned long vmemmap_addr = (unsigned long)head;
-	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
 
 	if (!HPageVmemmapOptimized(head))
 		return 0;
 
-	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
-	vmemmap_pages	= hugetlb_optimize_vmemmap_pages(h);
-	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
-	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
+	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse	= vmemmap_start;
+	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
 
 	/*
-	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
+	 * The pages which the vmemmap virtual address range [@vmemmap_start,
 	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
 	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
 	 * When a HugeTLB page is freed to the buddy allocator, previously
 	 * discarded vmemmap pages must be allocated and remapping.
 	 */
-	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
 				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
 	if (!ret) {
 		ClearHPageVmemmapOptimized(head);
@@ -461,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
 	return ret;
 }
 
-static unsigned int vmemmap_optimizable_pages(struct hstate *h,
-					      struct page *head)
+/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
+static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 {
 	if (!READ_ONCE(vmemmap_optimize_enabled))
-		return 0;
+		return false;
+
+	if (!hugetlb_vmemmap_optimizable(h))
+		return false;
 
 	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
 		pmd_t *pmdp, pmd;
@@ -508,73 +506,47 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h,
 		 *          +-------------------------------------------+
 		 */
 		if (PageVmemmapSelfHosted(vmemmap_page))
-			return 0;
+			return false;
 	}
 
-	return hugetlb_optimize_vmemmap_pages(h);
+	return true;
 }
 
-void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
+/**
+ * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
+ * @h:		struct hstate.
+ * @head:	the head page whose vmemmap pages will be optimized.
+ *
+ * This function only tries to optimize @head's vmemmap pages and does not
+ * guarantee that the optimization will succeed after it returns. The caller
+ * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
+ * have been optimized.
+ */
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
-	unsigned long vmemmap_addr = (unsigned long)head;
-	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
+	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
+	unsigned long vmemmap_reuse;
 
-	vmemmap_pages = vmemmap_optimizable_pages(h, head);
-	if (!vmemmap_pages)
+	if (!vmemmap_should_optimize(h, head))
 		return;
 
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
-	vmemmap_addr	+= RESERVE_VMEMMAP_SIZE;
-	vmemmap_end	= vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
-	vmemmap_reuse	= vmemmap_addr - PAGE_SIZE;
+	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
+	vmemmap_reuse	= vmemmap_start;
+	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
 
 	/*
-	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
+	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
 	 * to the page which @vmemmap_reuse is mapped to, then free the pages
-	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
+	 * which the range [@vmemmap_start, @vmemmap_end] is mapped to.
 	 */
-	if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
 		static_branch_dec(&hugetlb_optimize_vmemmap_key);
 	else
 		SetHPageVmemmapOptimized(head);
 }
 
-void __init hugetlb_vmemmap_init(struct hstate *h)
-{
-	unsigned int nr_pages = pages_per_huge_page(h);
-	unsigned int vmemmap_pages;
-
-	/*
-	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
-	 * page structs that can be used when HVO is enabled, add a BUILD_BUG_ON
-	 * to catch invalid usage of the tail page structs.
-	 */
-	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
-		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
-
-	if (!is_power_of_2(sizeof(struct page))) {
-		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
-		return;
-	}
-
-	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
-	/*
-	 * The head page is not to be freed to buddy allocator, the other tail
-	 * pages will map to the head page, so they can be freed.
-	 *
-	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
-	 * on some architectures (e.g. aarch64). See Documentation/arm64/
-	 * hugetlbpage.rst for more details.
-	 */
-	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
-		h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
-
-	pr_info("can optimize %d vmemmap pages for %s\n",
-		h->optimize_vmemmap_pages, h->name);
-}
-
-#ifdef CONFIG_PROC_SYSCTL
 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 	{
 		.procname	= "hugetlb_optimize_vmemmap",
@@ -586,16 +558,21 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 	{ }
 };
 
-static __init int hugetlb_vmemmap_sysctls_init(void)
+static int __init hugetlb_vmemmap_init(void)
 {
-	/*
-	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
-	 * be optimized.
-	 */
-	if (is_power_of_2(sizeof(struct page)))
-		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
-
+	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
+	BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);
+
+	if (IS_ENABLED(CONFIG_PROC_SYSCTL)) {
+		const struct hstate *h;
+
+		for_each_hstate(h) {
+			if (hugetlb_vmemmap_optimizable(h)) {
+				register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+				break;
+			}
+		}
+	}
 	return 0;
 }
-late_initcall(hugetlb_vmemmap_sysctls_init);
-#endif /* CONFIG_PROC_SYSCTL */
+late_initcall(hugetlb_vmemmap_init);
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index ba66fadad9fc..25bd0e002431 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,35 +11,50 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head);
-void hugetlb_vmemmap_free(struct hstate *h, struct page *head);
-void hugetlb_vmemmap_init(struct hstate *h);
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
 
 /*
- * How many vmemmap pages associated with a HugeTLB page that can be
- * optimized and freed to the buddy allocator.
+ * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
+ * Documentation/vm/vmemmap_dedup.rst.
  */
-static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
+#define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
+
+static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
 {
-	return h->optimize_vmemmap_pages;
+	return pages_per_huge_page(h) * sizeof(struct page);
+}
+
+/*
+ * Return how many vmemmap size associated with a HugeTLB page that can be
+ * optimized and can be freed to the buddy allocator.
+ */
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
+{
+	int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;
+
+	if (!is_power_of_2(sizeof(struct page)))
+		return 0;
+	return size > 0 ? size : 0;
 }
 #else
-static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
+static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 {
 	return 0;
 }
 
-static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
+static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 {
 }
 
-static inline void hugetlb_vmemmap_init(struct hstate *h)
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
 {
+	return 0;
 }
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 
-static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
+static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
 {
-	return 0;
+	return hugetlb_vmemmap_optimizable_size(h) != 0;
 }
-#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 #endif /* _LINUX_HUGETLB_VMEMMAP_H */
-- 
cgit v1.2.3


From 838691a1c0ec44739db558834e6954d62577d6b8 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Tue, 28 Jun 2022 17:22:34 +0800
Subject: mm: hugetlb_vmemmap: move code comments to vmemmap_dedup.rst

All the comments which explains how HVO works are moved to
vmemmap_dedup.rst since

  commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")

except some comments above page_fixed_fake_head().  This commit moves
those comments to vmemmap_dedup.rst and improve vmemmap_dedup.rst as well.

Link: https://lkml.kernel.org/r/20220628092235.91270-8-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/vmemmap_dedup.rst | 70 +++++++++++++++++++++++++-------------
 include/linux/page-flags.h         | 15 ++------
 2 files changed, 49 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index 7d7a161aa364..a4b12ff906c4 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -9,23 +9,23 @@ HugeTLB
 
 This section is to explain how HugeTLB Vmemmap Optimization (HVO) works.
 
-The struct page structures (page structs) are used to describe a physical
-page frame. By default, there is a one-to-one mapping from a page frame to
-it's corresponding page struct.
+The ``struct page`` structures are used to describe a physical page frame. By
+default, there is a one-to-one mapping from a page frame to it's corresponding
+``struct page``.
 
 HugeTLB pages consist of multiple base page size pages and is supported by many
 architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
 details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
 currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
 consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages.
-For each base page, there is a corresponding page struct.
+For each base page, there is a corresponding ``struct page``.
 
-Within the HugeTLB subsystem, only the first 4 page structs are used to
-contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
-this upper limit. The only 'useful' information in the remaining page structs
+Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to
+contain unique information about a HugeTLB page. ``__NR_USED_SUBPAGE`` provides
+this upper limit. The only 'useful' information in the remaining ``struct page``
 is the compound_head field, and this field is the same for all tail pages.
 
-By removing redundant page structs for HugeTLB pages, memory can be returned
+By removing redundant ``struct page`` for HugeTLB pages, memory can be returned
 to the buddy allocator for other uses.
 
 Different architectures support different HugeTLB pages. For example, the
@@ -46,7 +46,7 @@ page.
 |              |   64KB    |    2MB    |  512MB    |    16GB   |           |
 +--------------+-----------+-----------+-----------+-----------+-----------+
 
-When the system boot up, every HugeTLB page has more than one struct page
+When the system boot up, every HugeTLB page has more than one ``struct page``
 structs which size is (unit: pages)::
 
    struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
@@ -76,10 +76,10 @@ Where n is how many pte entries which one page can contains. So the value of
 n is (PAGE_SIZE / sizeof(pte_t)).
 
 This optimization only supports 64-bit system, so the value of sizeof(pte_t)
-is 8. And this optimization also applicable only when the size of struct page
-is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+is 8. And this optimization also applicable only when the size of ``struct page``
+is a power of two. In most cases, the size of ``struct page`` is 64 bytes (e.g.
 x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
-size of struct page structs of it is 8 page frames which size depends on the
+size of ``struct page`` structs of it is 8 page frames which size depends on the
 size of the base page.
 
 For the HugeTLB page of the pud level mapping, then::
@@ -88,7 +88,7 @@ For the HugeTLB page of the pud level mapping, then::
                = PAGE_SIZE / 8 * 8 (pages)
                = PAGE_SIZE (pages)
 
-Where the struct_size(pmd) is the size of the struct page structs of a
+Where the struct_size(pmd) is the size of the ``struct page`` structs of a
 HugeTLB page of the pmd level mapping.
 
 E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
@@ -96,7 +96,7 @@ HugeTLB page consists in 4096.
 
 Next, we take the pmd level mapping of the HugeTLB page as an example to
 show the internal implementation of this optimization. There are 8 pages
-struct page structs associated with a HugeTLB page which is pmd mapped.
+``struct page`` structs associated with a HugeTLB page which is pmd mapped.
 
 Here is how things look before optimization::
 
@@ -124,10 +124,10 @@ Here is how things look before optimization::
  +-----------+
 
 The value of page->compound_head is the same for all tail pages. The first
-page of page structs (page 0) associated with the HugeTLB page contains the 4
-page structs necessary to describe the HugeTLB. The only use of the remaining
-pages of page structs (page 1 to page 7) is to point to page->compound_head.
-Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
+page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4
+``struct page`` necessary to describe the HugeTLB. The only use of the remaining
+pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head.
+Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page``
 will be used for each HugeTLB page. This will allow us to free the remaining
 7 pages to the buddy allocator.
 
@@ -169,13 +169,37 @@ entries that can be cached in a single TLB entry.
 
 The contiguous bit is used to increase the mapping size at the pmd and pte
 (last) level. So this type of HugeTLB page can be optimized only when its
-size of the struct page structs is greater than 1 page.
+size of the ``struct page`` structs is greater than **1** page.
 
 Notice: The head vmemmap page is not freed to the buddy allocator and all
 tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
-more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
-associated with each HugeTLB page. The compound_head() can handle this
-correctly (more details refer to the comment above compound_head()).
+more than one ``struct page`` struct with ``PG_head`` (e.g. 8 per 2 MB HugeTLB
+page) associated with each HugeTLB page. The ``compound_head()`` can handle
+this correctly. There is only **one** head ``struct page``, the tail
+``struct page`` with ``PG_head`` are fake head ``struct page``.  We need an
+approach to distinguish between those two different types of ``struct page`` so
+that ``compound_head()`` can return the real head ``struct page`` when the
+parameter is the tail ``struct page`` but with ``PG_head``. The following code
+snippet describes how to distinguish between real and fake head ``struct page``.
+
+.. code-block:: c
+
+	if (test_bit(PG_head, &page->flags)) {
+		unsigned long head = READ_ONCE(page[1].compound_head);
+
+		if (head & 1) {
+			if (head == (unsigned long)page + 1)
+				/* head struct page */
+			else
+				/* tail struct page */
+		} else {
+			/* head struct page */
+		}
+	}
+
+We can safely access the field of the **page[1]** with ``PG_head`` because the
+page is a compound page composed with at least two contiguous pages.
+The implementation refers to ``page_fixed_fake_head()``.
 
 Device DAX
 ==========
@@ -189,7 +213,7 @@ PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
 
 The differences with HugeTLB are relatively minor.
 
-It only use 3 page structs for storing all information as opposed
+It only use 3 ``struct page`` for storing all information as opposed
 to 4 on HugeTLB pages.
 
 There's no remapping of vmemmap given that device-dax memory is not part of
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 78ed46ae6ee5..465ff35a8c00 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,19 +208,8 @@ enum pageflags {
 DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 
 /*
- * If HVO is enabled, the head vmemmap page frame is reused and all of the tail
- * vmemmap addresses map to the head vmemmap page frame (furture details can
- * refer to the figure at the head of the mm/hugetlb_vmemmap.c).  In other
- * words, there are more than one page struct with PG_head associated with each
- * HugeTLB page.  We __know__ that there is only one head page struct, the tail
- * page structs with PG_head are fake head page structs.  We need an approach
- * to distinguish between those two different types of page structs so that
- * compound_head() can return the real head page struct when the parameter is
- * the tail page struct but with PG_head.
- *
- * The page_fixed_fake_head() returns the real head page struct if the @page is
- * fake page head, otherwise, returns @page which can either be a true page
- * head or tail.
+ * Return the real head page struct iff the @page is a fake head page, otherwise
+ * return the @page itself. See Documentation/mm/vmemmap_dedup.rst.
  */
 static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
 {
-- 
cgit v1.2.3


From 161df60e9e89651c9aa3ae0edc9aae3a8a2d21e7 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:15 +0900
Subject: mm, hwpoison, hugetlb: support saving mechanism of raw error pages

When handling memory error on a hugetlb page, the error handler tries to
dissolve and turn it into 4kB pages.  If it's successfully dissolved,
PageHWPoison flag is moved to the raw error page, so that's all right.
However, dissolve sometimes fails, then the error page is left as
hwpoisoned hugepage.  It's useful if we can retry to dissolve it to save
healthy pages, but that's not possible now because the information about
where the raw error pages is lost.

Use the private field of a few tail pages to keep that information.  The
code path of shrinking hugepage pool uses this info to try delayed
dissolve.  In order to remember multiple errors in a hugepage, a
singly-linked list originated from SUBPAGE_INDEX_HWPOISON-th tail page is
constructed.  Only simple operations (adding an entry or clearing all) are
required and the list is assumed not to be very long, so this simple data
structure should be enough.

If we failed to save raw error info, the hwpoison hugepage has errors on
unknown subpage, then this new saving mechanism does not work any more, so
disable saving new raw error info and freeing hwpoison hugepages.

Link: https://lkml.kernel.org/r/20220714042420.1847125-4-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 17 +++++++++-
 mm/hugetlb.c            | 23 ++++++++-----
 mm/memory-failure.c     | 89 +++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 116 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6d0620edf0a6..3ec981a0d8b3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -42,6 +42,9 @@ enum {
 	SUBPAGE_INDEX_CGROUP,		/* reuse page->private */
 	SUBPAGE_INDEX_CGROUP_RSVD,	/* reuse page->private */
 	__MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+	SUBPAGE_INDEX_HWPOISON,
 #endif
 	__NR_USED_SUBPAGE,
 };
@@ -551,7 +554,7 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  *	Synchronization:  Initially set after new page allocation with no
  *	locking.  When examined and modified during migration processing
  *	(isolate, migrate, putback) the hugetlb_lock is held.
- * HPG_temporary - - Set on a page that is temporarily allocated from the buddy
+ * HPG_temporary - Set on a page that is temporarily allocated from the buddy
  *	allocator.  Typically used for migration target pages when no pages
  *	are available in the pool.  The hugetlb free page path will
  *	immediately free pages with this flag set to the buddy allocator.
@@ -561,6 +564,8 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
  * HPG_freed - Set when page is on the free lists.
  *	Synchronization: hugetlb_lock held for examination and modification.
  * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
+ * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
+ *     that is not tracked by raw_hwp_page list.
  */
 enum hugetlb_page_flags {
 	HPG_restore_reserve = 0,
@@ -568,6 +573,7 @@ enum hugetlb_page_flags {
 	HPG_temporary,
 	HPG_freed,
 	HPG_vmemmap_optimized,
+	HPG_raw_hwp_unreliable,
 	__NR_HPAGEFLAGS,
 };
 
@@ -614,6 +620,7 @@ HPAGEFLAG(Migratable, migratable)
 HPAGEFLAG(Temporary, temporary)
 HPAGEFLAG(Freed, freed)
 HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
+HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
 
 #ifdef CONFIG_HUGETLB_PAGE
 
@@ -796,6 +803,14 @@ extern int dissolve_free_huge_page(struct page *page);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
 				    unsigned long end_pfn);
 
+#ifdef CONFIG_MEMORY_FAILURE
+extern void hugetlb_clear_page_hwpoison(struct page *hpage);
+#else
+static inline void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+}
+#endif
+
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 #ifndef arch_hugetlb_migration_supported
 static inline bool arch_hugetlb_migration_supported(struct hstate *h)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9b79806be17b..0aee2f3ae15c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1535,6 +1535,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
 
+	/*
+	 * If we don't know which subpages are hwpoisoned, we can't free
+	 * the hugepage, so it's leaked intentionally.
+	 */
+	if (HPageRawHwpUnreliable(page))
+		return;
+
 	if (hugetlb_vmemmap_restore(h, page)) {
 		spin_lock_irq(&hugetlb_lock);
 		/*
@@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
 		return;
 	}
 
+	/*
+	 * Move PageHWPoison flag from head page to the raw error pages,
+	 * which makes any healthy subpages reusable.
+	 */
+	if (unlikely(PageHWPoison(page)))
+		hugetlb_clear_page_hwpoison(page);
+
 	for (i = 0; i < pages_per_huge_page(h);
 	     i++, subpage = mem_map_next(subpage, page, i)) {
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -2109,15 +2123,6 @@ retry:
 		 */
 		rc = hugetlb_vmemmap_restore(h, head);
 		if (!rc) {
-			/*
-			 * Move PageHWPoison flag from head page to the raw
-			 * error page, which makes any subpages rather than
-			 * the error page reusable.
-			 */
-			if (PageHWPoison(head) && page != head) {
-				SetPageHWPoison(page);
-				ClearPageHWPoison(head);
-			}
 			update_and_free_page(h, head, false);
 		} else {
 			spin_lock_irq(&hugetlb_lock);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9a7a228ad04a..61668ce20836 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1662,6 +1662,90 @@ unlock:
 EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
 #endif /* CONFIG_FS_DAX */
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about "raw error page",
+ * constructing singly linked list originated from ->private field of
+ * SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+struct raw_hwp_page {
+	struct llist_node node;
+	struct page *page;
+};
+
+static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+{
+	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
+}
+
+static void __free_raw_hwp_pages(struct page *hpage)
+{
+	struct llist_head *head;
+	struct llist_node *t, *tnode;
+
+	head = raw_hwp_list_head(hpage);
+	llist_for_each_safe(tnode, t, head->first) {
+		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+		SetPageHWPoison(p->page);
+		kfree(p);
+	}
+	llist_del_all(head);
+}
+
+static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+{
+	struct llist_head *head;
+	struct raw_hwp_page *raw_hwp;
+	struct llist_node *t, *tnode;
+	int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
+
+	/*
+	 * Once the hwpoison hugepage has lost reliable raw error info,
+	 * there is little meaning to keep additional error info precisely,
+	 * so skip to add additional raw error info.
+	 */
+	if (HPageRawHwpUnreliable(hpage))
+		return -EHWPOISON;
+	head = raw_hwp_list_head(hpage);
+	llist_for_each_safe(tnode, t, head->first) {
+		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+		if (p->page == page)
+			return -EHWPOISON;
+	}
+
+	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+	if (raw_hwp) {
+		raw_hwp->page = page;
+		llist_add(&raw_hwp->node, head);
+		/* the first error event will be counted in action_result(). */
+		if (ret)
+			num_poisoned_pages_inc();
+	} else {
+		/*
+		 * Failed to save raw error info.  We no longer trace all
+		 * hwpoisoned subpages, and we need refuse to free/dissolve
+		 * this hwpoisoned hugepage.
+		 */
+		SetHPageRawHwpUnreliable(hpage);
+		/*
+		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+		 * used any more, so free it.
+		 */
+		__free_raw_hwp_pages(hpage);
+	}
+	return ret;
+}
+
+void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+	if (HPageRawHwpUnreliable(hpage))
+		return;
+	ClearPageHWPoison(hpage);
+	__free_raw_hwp_pages(hpage);
+}
+
 /*
  * Called from hugetlb code with hugetlb_lock held.
  *
@@ -1696,7 +1780,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 		goto out;
 	}
 
-	if (TestSetPageHWPoison(head)) {
+	if (hugetlb_set_page_hwpoison(head, page)) {
 		ret = -EHWPOISON;
 		goto out;
 	}
@@ -1708,7 +1792,6 @@ out:
 	return ret;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
 /*
  * Taking refcount of hugetlb pages needs extra care about race conditions
  * with basic operations like hugepage allocation/free/demotion.
@@ -1749,7 +1832,7 @@ retry:
 	lock_page(head);
 
 	if (hwpoison_filter(p)) {
-		ClearPageHWPoison(head);
+		hugetlb_clear_page_hwpoison(head);
 		res = -EOPNOTSUPP;
 		goto out;
 	}
-- 
cgit v1.2.3


From ac5fcde0a96a18773f06b7c00c5ea081bbdc64b3 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:16 +0900
Subject: mm, hwpoison: make unpoison aware of raw error info in hwpoisoned
 hugepage

Raw error info list needs to be removed when hwpoisoned hugetlb is
unpoisoned.  And unpoison handler needs to know how many errors there are
in the target hugepage.  So add them.

HPageVmemmapOptimized(hpage) and HPageRawHwpUnreliable(hpage)) sometimes
can't be unpoisoned, so skip them.

Link: https://lkml.kernel.org/r/20220714042420.1847125-5-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swapops.h |  9 +++++++++
 mm/memory-failure.c     | 52 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 56 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index bb7afd03a324..a3d435bf9f97 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -490,6 +490,11 @@ static inline void num_poisoned_pages_dec(void)
 	atomic_long_dec(&num_poisoned_pages);
 }
 
+static inline void num_poisoned_pages_sub(long i)
+{
+	atomic_long_sub(i, &num_poisoned_pages);
+}
+
 #else
 
 static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -505,6 +510,10 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 static inline void num_poisoned_pages_inc(void)
 {
 }
+
+static inline void num_poisoned_pages_sub(long i)
+{
+}
 #endif
 
 static inline int non_swap_entry(swp_entry_t entry)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 61668ce20836..e30dbeca09d1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1678,19 +1678,23 @@ static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
 	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
 }
 
-static void __free_raw_hwp_pages(struct page *hpage)
+static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
 {
 	struct llist_head *head;
 	struct llist_node *t, *tnode;
+	unsigned long count = 0;
 
 	head = raw_hwp_list_head(hpage);
 	llist_for_each_safe(tnode, t, head->first) {
 		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
 
-		SetPageHWPoison(p->page);
+		if (move_flag)
+			SetPageHWPoison(p->page);
 		kfree(p);
+		count++;
 	}
 	llist_del_all(head);
+	return count;
 }
 
 static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
@@ -1733,17 +1737,36 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
 		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
 		 * used any more, so free it.
 		 */
-		__free_raw_hwp_pages(hpage);
+		__free_raw_hwp_pages(hpage, false);
 	}
 	return ret;
 }
 
+static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+	/*
+	 * HPageVmemmapOptimized hugepages can't be freed because struct
+	 * pages for tail pages are required but they don't exist.
+	 */
+	if (move_flag && HPageVmemmapOptimized(hpage))
+		return 0;
+
+	/*
+	 * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+	 * definition.
+	 */
+	if (HPageRawHwpUnreliable(hpage))
+		return 0;
+
+	return __free_raw_hwp_pages(hpage, move_flag);
+}
+
 void hugetlb_clear_page_hwpoison(struct page *hpage)
 {
 	if (HPageRawHwpUnreliable(hpage))
 		return;
 	ClearPageHWPoison(hpage);
-	__free_raw_hwp_pages(hpage);
+	free_raw_hwp_pages(hpage, true);
 }
 
 /*
@@ -1887,6 +1910,10 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *
 	return 0;
 }
 
+static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+{
+	return 0;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
@@ -2292,6 +2319,7 @@ int unpoison_memory(unsigned long pfn)
 	struct page *p;
 	int ret = -EBUSY;
 	int freeit = 0;
+	unsigned long count = 1;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
 
@@ -2339,6 +2367,13 @@ int unpoison_memory(unsigned long pfn)
 
 	ret = get_hwpoison_page(p, MF_UNPOISON);
 	if (!ret) {
+		if (PageHuge(p)) {
+			count = free_raw_hwp_pages(page, false);
+			if (count == 0) {
+				ret = -EBUSY;
+				goto unlock_mutex;
+			}
+		}
 		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
 	} else if (ret < 0) {
 		if (ret == -EHWPOISON) {
@@ -2347,6 +2382,13 @@ int unpoison_memory(unsigned long pfn)
 			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
 					 pfn, &unpoison_rs);
 	} else {
+		if (PageHuge(p)) {
+			count = free_raw_hwp_pages(page, false);
+			if (count == 0) {
+				ret = -EBUSY;
+				goto unlock_mutex;
+			}
+		}
 		freeit = !!TestClearPageHWPoison(p);
 
 		put_page(page);
@@ -2359,7 +2401,7 @@ int unpoison_memory(unsigned long pfn)
 unlock_mutex:
 	mutex_unlock(&mf_mutex);
 	if (!ret || freeit) {
-		num_poisoned_pages_dec();
+		num_poisoned_pages_sub(count);
 		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
 				 page_to_pfn(p), &unpoison_rs);
 	}
-- 
cgit v1.2.3


From 38f6d29397ccb9c191c4c91103e8123f518fdc10 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:17 +0900
Subject: mm, hwpoison: set PG_hwpoison for busy hugetlb pages

If memory_failure() fails to grab page refcount on a hugetlb page because
it's busy, it returns without setting PG_hwpoison on it.  This not only
loses a chance of error containment, but breaks the rule that
action_result() should be called only when memory_failure() do any of
handling work (even if that's just setting PG_hwpoison).  This
inconsistency could harm code maintainability.

So set PG_hwpoison and call hugetlb_set_page_hwpoison() for such a case.

Link: https://lkml.kernel.org/r/20220714042420.1847125-6-naoya.horiguchi@linux.dev
Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h  | 1 +
 mm/memory-failure.c | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e6e201a4ce05..0345b8c30394 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3176,6 +3176,7 @@ enum mf_flags {
 	MF_SOFT_OFFLINE = 1 << 3,
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
+	MF_NO_RETRY = 1 << 6,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e30dbeca09d1..a748f5dd7b8e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1800,7 +1800,8 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
 			count_increased = true;
 	} else {
 		ret = -EBUSY;
-		goto out;
+		if (!(flags & MF_NO_RETRY))
+			goto out;
 	}
 
 	if (hugetlb_set_page_hwpoison(head, page)) {
@@ -1827,7 +1828,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 	struct page *p = pfn_to_page(pfn);
 	struct page *head;
 	unsigned long page_flags;
-	bool retry = true;
 
 	*hugetlb = 1;
 retry:
@@ -1843,8 +1843,8 @@ retry:
 		}
 		return res;
 	} else if (res == -EBUSY) {
-		if (retry) {
-			retry = false;
+		if (!(flags & MF_NO_RETRY)) {
+			flags |= MF_NO_RETRY;
 			goto retry;
 		}
 		action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
-- 
cgit v1.2.3


From 6f4614886baa59b6ae014093300482c1da4d3c93 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi@nec.com>
Date: Thu, 14 Jul 2022 13:24:20 +0900
Subject: mm, hwpoison: enable memory error handling on 1GB hugepage

Now error handling code is prepared, so remove the blocking code and
enable memory error handling on 1GB hugepage.

Link: https://lkml.kernel.org/r/20220714042420.1847125-9-naoya.horiguchi@linux.dev
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h      |  1 -
 include/ras/ras_event.h |  1 -
 mm/memory-failure.c     | 16 ----------------
 3 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0345b8c30394..3bedc449c14d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3229,7 +3229,6 @@ enum mf_action_page_type {
 	MF_MSG_DIFFERENT_COMPOUND,
 	MF_MSG_HUGE,
 	MF_MSG_FREE_HUGE,
-	MF_MSG_NON_PMD_HUGE,
 	MF_MSG_UNMAP_FAILED,
 	MF_MSG_DIRTY_SWAPCACHE,
 	MF_MSG_CLEAN_SWAPCACHE,
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index d0337a41141c..cbd3ddd7c33d 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -360,7 +360,6 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
 	EM ( MF_MSG_HUGE, "huge page" )					\
 	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
-	EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" )		\
 	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
 	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
 	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index de4aff85e466..14439806b5ef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -768,7 +768,6 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
 	[MF_MSG_HUGE]			= "huge page",
 	[MF_MSG_FREE_HUGE]		= "free huge page",
-	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
 	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
 	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
 	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
@@ -1885,21 +1884,6 @@ retry:
 
 	page_flags = head->flags;
 
-	/*
-	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
-	 * simply disable it. In order to make it work properly, we need
-	 * make sure that:
-	 *  - conversion of a pud that maps an error hugetlb into hwpoison
-	 *    entry properly works, and
-	 *  - other mm code walking over page table is aware of pud-aligned
-	 *    hwpoison entries.
-	 */
-	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
-		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
-		res = -EBUSY;
-		goto out;
-	}
-
 	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
-- 
cgit v1.2.3


From 729337bc20876af348b363b3e35fb19be71ba793 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 28 Jul 2022 17:48:38 +0200
Subject: highmem: remove unneeded spaces in kmap_local_page() kdocs

Patch series "highmem: Extend kmap_local_page() documentation", v2.

The Highmem interface is evolving and the current documentation does not
reflect the intended uses of each of the calls.  Furthermore, after a
recent series of reworks, the differences of the calls can still be
confusing and may lead to the expanded use of calls which are deprecated.

This series is the second round of changes towards an enhanced
documentation of the Highmem's interface; at this stage the patches are
only focused to kmap_local_page().

In addition it also contains some minor clean ups.


This patch (of 7):

In the kdocs of kmap_local_page(), the description of @page starts after
several unnecessary spaces.

Therefore, remove those spaces.

Link: https://lkml.kernel.org/r/20220728154844.10874-1-fmdefrancesco@gmail.com
Link: https://lkml.kernel.org/r/20220728154844.10874-2-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 177b07944640..0bd5ed4ac391 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -60,7 +60,7 @@ static inline void kmap_flush_unused(void);
 
 /**
  * kmap_local_page - Map a page for temporary usage
- * @page:	Pointer to the page to be mapped
+ * @page: Pointer to the page to be mapped
  *
  * Returns: The virtual address of the mapping
  *
-- 
cgit v1.2.3


From 383bbef283920411379c5c93829102ff7859fea5 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 28 Jul 2022 17:48:39 +0200
Subject: highmem: specify that kmap_local_page() is callable from interrupts

In a recent thread about converting kmap() to kmap_local_page(), the
safety of calling kmap_local_page() was questioned.[1]

"any context" should probably be enough detail for users who want to know
whether or not kmap_local_page() can be called from interrupts.  However,
Linux still has kmap_atomic() which might make users think they must use
the latter in interrupts.

Add "including interrupts" for better clarity.

[1] https://lore.kernel.org/lkml/3187836.aeNJFYEL58@opensuse/

Link: https://lkml.kernel.org/r/20220728154844.10874-3-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0bd5ed4ac391..4a46d95ff6c8 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -64,7 +64,7 @@ static inline void kmap_flush_unused(void);
  *
  * Returns: The virtual address of the mapping
  *
- * Can be invoked from any context.
+ * Can be invoked from any context, including interrupts.
  *
  * Requires careful handling when nesting multiple mappings because the map
  * management is stack based. The unmap has to be in the reverse order of
-- 
cgit v1.2.3


From 72f1c55adf70fd08ceac6b67455238db2014894a Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fmdefrancesco@gmail.com>
Date: Thu, 28 Jul 2022 17:48:43 +0200
Subject: highmem: delete a sentence from kmap_local_page() kdocs

kmap_local_page() should always be preferred in place of kmap() and
kmap_atomic().  "Only use when really necessary." is not consistent with
the Documentation/mm/highmem.rst and these kdocs it embeds.

Therefore, delete the above-mentioned sentence from kdocs.

Link: https://lkml.kernel.org/r/20220728154844.10874-7-fmdefrancesco@gmail.com
Signed-off-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 4a46d95ff6c8..25679035ca28 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -86,8 +86,7 @@ static inline void kmap_flush_unused(void);
  * temporarily mapped.
  *
  * While it is significantly faster than kmap() for the higmem case it
- * comes with restrictions about the pointer validity. Only use when really
- * necessary.
+ * comes with restrictions about the pointer validity.
  *
  * On HIGHMEM enabled systems mapping a highmem page has the side effect of
  * disabling migration in order to keep the virtual address stable across
-- 
cgit v1.2.3


From fcb14cb1bdacec5b4374fe161e83fb8208164a85 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 22 May 2022 14:59:25 -0400
Subject: new iov_iter flavour - ITER_UBUF

Equivalent of single-segment iovec.  Initialized by iov_iter_ubuf(),
checked for by iter_is_ubuf(), otherwise behaves like ITER_IOVEC
ones.

We are going to expose the things like ->write_iter() et.al. to those
in subsequent commits.

New predicate (user_backed_iter()) that is true for ITER_IOVEC and
ITER_UBUF; places like direct-IO handling should use that for
checking that pages we modify after getting them from iov_iter_get_pages()
would need to be dirtied.

DO NOT assume that replacing iter_is_iovec() with user_backed_iter()
will solve all problems - there's code that uses iter_is_iovec() to
decide how to poke around in iov_iter guts and for that the predicate
replacement obviously won't suffice.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/fops.c         |  6 ++--
 fs/ceph/file.c       |  2 +-
 fs/cifs/file.c       |  2 +-
 fs/direct-io.c       |  2 +-
 fs/fuse/dev.c        |  4 +--
 fs/fuse/file.c       |  2 +-
 fs/gfs2/file.c       |  2 +-
 fs/iomap/direct-io.c |  2 +-
 fs/nfs/direct.c      |  2 +-
 include/linux/uio.h  | 26 ++++++++++++++++
 lib/iov_iter.c       | 87 +++++++++++++++++++++++++++++++++++++++++-----------
 mm/shmem.c           |  2 +-
 12 files changed, 108 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/block/fops.c b/block/fops.c
index a564cd81340c..b90742595317 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -75,7 +75,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 
 	if (iov_iter_rw(iter) == READ) {
 		bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
-		if (iter_is_iovec(iter))
+		if (user_backed_iter(iter))
 			should_dirty = true;
 	} else {
 		bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
@@ -204,7 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	dio->size = 0;
-	if (is_read && iter_is_iovec(iter))
+	if (is_read && user_backed_iter(iter))
 		dio->flags |= DIO_SHOULD_DIRTY;
 
 	blk_start_plug(&plug);
@@ -335,7 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	dio->size = bio->bi_iter.bi_size;
 
 	if (is_read) {
-		if (iter_is_iovec(iter)) {
+		if (user_backed_iter(iter)) {
 			dio->flags |= DIO_SHOULD_DIRTY;
 			bio_set_pages_dirty(bio);
 		}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index da59e836a06e..c535de5852bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1262,7 +1262,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
 	bool write = iov_iter_rw(iter) == WRITE;
-	bool should_dirty = !write && iter_is_iovec(iter);
+	bool should_dirty = !write && user_backed_iter(iter);
 
 	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e64cda7a7610..e1e05b253daa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4004,7 +4004,7 @@ static ssize_t __cifs_readv(
 	if (!is_sync_kiocb(iocb))
 		ctx->iocb = iocb;
 
-	if (iter_is_iovec(to))
+	if (user_backed_iter(to))
 		ctx->should_dirty = true;
 
 	if (direct) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index df5e2d048799..c7fc01c2d509 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1251,7 +1251,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
-	dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
+	dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
 	sdio.iter = iter;
 	sdio.final_block_in_request = end >> blkbits;
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0e537e580dc1..8d657c2cd6f7 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1356,7 +1356,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
 	if (!fud)
 		return -EPERM;
 
-	if (!iter_is_iovec(to))
+	if (!user_backed_iter(to))
 		return -EINVAL;
 
 	fuse_copy_init(&cs, 1, to);
@@ -1949,7 +1949,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
 	if (!fud)
 		return -EPERM;
 
-	if (!iter_is_iovec(from))
+	if (!user_backed_iter(from))
 		return -EINVAL;
 
 	fuse_copy_init(&cs, 0, from);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 00fa861aeead..c982e3afe3b4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1465,7 +1465,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 			inode_unlock(inode);
 	}
 
-	io->should_dirty = !write && iter_is_iovec(iter);
+	io->should_dirty = !write && user_backed_iter(iter);
 	while (count) {
 		ssize_t nres;
 		fl_owner_t owner = current->files;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2cceb193dcd8..48e6cc74fdc1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -780,7 +780,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
 
 	if (!count)
 		return false;
-	if (!iter_is_iovec(i))
+	if (!user_backed_iter(i))
 		return false;
 
 	size = PAGE_SIZE;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c75d33d5c3ce..4eb559a16c9e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -533,7 +533,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			iomi.flags |= IOMAP_NOWAIT;
 		}
 
-		if (iter_is_iovec(iter))
+		if (user_backed_iter(iter))
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
 		iomi.flags |= IOMAP_WRITE;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4eb2a8380a28..022e1ce63e62 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -478,7 +478,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	if (iter_is_iovec(iter))
+	if (user_backed_iter(iter))
 		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
 
 	if (!swap)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 9a2dc496d535..85bef84fd294 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -26,6 +26,7 @@ enum iter_type {
 	ITER_PIPE,
 	ITER_XARRAY,
 	ITER_DISCARD,
+	ITER_UBUF,
 };
 
 struct iov_iter_state {
@@ -38,6 +39,7 @@ struct iov_iter {
 	u8 iter_type;
 	bool nofault;
 	bool data_source;
+	bool user_backed;
 	size_t iov_offset;
 	size_t count;
 	union {
@@ -46,6 +48,7 @@ struct iov_iter {
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
 		struct pipe_inode_info *pipe;
+		void __user *ubuf;
 	};
 	union {
 		unsigned long nr_segs;
@@ -70,6 +73,11 @@ static inline void iov_iter_save_state(struct iov_iter *iter,
 	state->nr_segs = iter->nr_segs;
 }
 
+static inline bool iter_is_ubuf(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_UBUF;
+}
+
 static inline bool iter_is_iovec(const struct iov_iter *i)
 {
 	return iov_iter_type(i) == ITER_IOVEC;
@@ -105,6 +113,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->data_source ? WRITE : READ;
 }
 
+static inline bool user_backed_iter(const struct iov_iter *i)
+{
+	return i->user_backed;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
@@ -322,4 +335,17 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 int import_single_range(int type, void __user *buf, size_t len,
 		 struct iovec *iov, struct iov_iter *i);
 
+static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
+			void __user *buf, size_t count)
+{
+	WARN_ON(direction & ~(READ | WRITE));
+	*i = (struct iov_iter) {
+		.iter_type = ITER_UBUF,
+		.user_backed = true,
+		.data_source = direction,
+		.ubuf = buf,
+		.count = count
+	};
+}
+
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0e0be334dbee..b3493d20536e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -16,6 +16,16 @@
 
 #define PIPE_PARANOIA /* for now */
 
+/* covers ubuf and kbuf alike */
+#define iterate_buf(i, n, base, len, off, __p, STEP) {		\
+	size_t __maybe_unused off = 0;				\
+	len = n;						\
+	base = __p + i->iov_offset;				\
+	len -= (STEP);						\
+	i->iov_offset += len;					\
+	n = len;						\
+}
+
 /* covers iovec and kvec alike */
 #define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
 	size_t off = 0;						\
@@ -110,7 +120,12 @@ __out:								\
 	if (unlikely(i->count < n))				\
 		n = i->count;					\
 	if (likely(n)) {					\
-		if (likely(iter_is_iovec(i))) {			\
+		if (likely(iter_is_ubuf(i))) {			\
+			void __user *base;			\
+			size_t len;				\
+			iterate_buf(i, n, base, len, off,	\
+						i->ubuf, (I)) 	\
+		} else if (likely(iter_is_iovec(i))) {		\
 			const struct iovec *iov = i->iov;	\
 			void __user *base;			\
 			size_t len;				\
@@ -275,7 +290,11 @@ out:
  */
 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
 {
-	if (iter_is_iovec(i)) {
+	if (iter_is_ubuf(i)) {
+		size_t n = min(size, iov_iter_count(i));
+		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
+		return size - n;
+	} else if (iter_is_iovec(i)) {
 		size_t count = min(size, iov_iter_count(i));
 		const struct iovec *p;
 		size_t skip;
@@ -314,7 +333,11 @@ EXPORT_SYMBOL(fault_in_iov_iter_readable);
  */
 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
 {
-	if (iter_is_iovec(i)) {
+	if (iter_is_ubuf(i)) {
+		size_t n = min(size, iov_iter_count(i));
+		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
+		return size - n;
+	} else if (iter_is_iovec(i)) {
 		size_t count = min(size, iov_iter_count(i));
 		const struct iovec *p;
 		size_t skip;
@@ -345,6 +368,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 	*i = (struct iov_iter) {
 		.iter_type = ITER_IOVEC,
 		.nofault = false,
+		.user_backed = true,
 		.data_source = direction,
 		.iov = iov,
 		.nr_segs = nr_segs,
@@ -494,7 +518,7 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(iov_iter_is_pipe(i)))
 		return copy_pipe_to_iter(addr, bytes, i);
-	if (iter_is_iovec(i))
+	if (user_backed_iter(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyout(base, addr + off, len),
@@ -583,7 +607,7 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(iov_iter_is_pipe(i)))
 		return copy_mc_pipe_to_iter(addr, bytes, i);
-	if (iter_is_iovec(i))
+	if (user_backed_iter(i))
 		might_fault();
 	__iterate_and_advance(i, bytes, base, len, off,
 		copyout_mc(base, addr + off, len),
@@ -601,7 +625,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 		WARN_ON(1);
 		return 0;
 	}
-	if (iter_is_iovec(i))
+	if (user_backed_iter(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyin(addr + off, base, len),
@@ -894,16 +918,16 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 {
 	if (unlikely(i->count < size))
 		size = i->count;
-	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
+	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
+		i->iov_offset += size;
+		i->count -= size;
+	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
 		/* iovec and kvec have identical layouts */
 		iov_iter_iovec_advance(i, size);
 	} else if (iov_iter_is_bvec(i)) {
 		iov_iter_bvec_advance(i, size);
 	} else if (iov_iter_is_pipe(i)) {
 		pipe_advance(i, size);
-	} else if (unlikely(iov_iter_is_xarray(i))) {
-		i->iov_offset += size;
-		i->count -= size;
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
 	}
@@ -950,7 +974,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 		return;
 	}
 	unroll -= i->iov_offset;
-	if (iov_iter_is_xarray(i)) {
+	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
 		BUG(); /* We should never go beyond the start of the specified
 			* range since we might then be straying into pages that
 			* aren't pinned.
@@ -1158,6 +1182,14 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
 bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 			 unsigned len_mask)
 {
+	if (likely(iter_is_ubuf(i))) {
+		if (i->count & len_mask)
+			return false;
+		if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
+			return false;
+		return true;
+	}
+
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 		return iov_iter_aligned_iovec(i, addr_mask, len_mask);
 
@@ -1233,6 +1265,13 @@ static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
 
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
+	if (likely(iter_is_ubuf(i))) {
+		size_t size = i->count;
+		if (size)
+			return ((unsigned long)i->ubuf + i->iov_offset) | size;
+		return 0;
+	}
+
 	/* iovec and kvec have identical layouts */
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 		return iov_iter_alignment_iovec(i);
@@ -1263,6 +1302,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 	size_t size = i->count;
 	unsigned k;
 
+	if (iter_is_ubuf(i))
+		return 0;
+
 	if (WARN_ON(!iter_is_iovec(i)))
 		return ~0U;
 
@@ -1385,12 +1427,15 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
 	return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
 }
 
-/* must be done on non-empty ITER_IOVEC one */
+/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
 {
 	size_t skip;
 	long k;
 
+	if (iter_is_ubuf(i))
+		return (unsigned long)i->ubuf + i->iov_offset;
+
 	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
 		size_t len = i->iov[k].iov_len - skip;
 
@@ -1432,7 +1477,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 	if (maxsize > MAX_RW_COUNT)
 		maxsize = MAX_RW_COUNT;
 
-	if (likely(iter_is_iovec(i))) {
+	if (likely(user_backed_iter(i))) {
 		unsigned int gup_flags = 0;
 		unsigned long addr;
 
@@ -1559,7 +1604,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 	if (maxsize > MAX_RW_COUNT)
 		maxsize = MAX_RW_COUNT;
 
-	if (likely(iter_is_iovec(i))) {
+	if (likely(user_backed_iter(i))) {
 		unsigned int gup_flags = 0;
 		unsigned long addr;
 
@@ -1715,6 +1760,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	if (unlikely(!i->count))
 		return 0;
+	if (likely(iter_is_ubuf(i))) {
+		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
+		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
+		return min(npages, maxpages);
+	}
 	/* iovec and kvec have identical layouts */
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 		return iov_npages(i, maxpages);
@@ -1749,17 +1799,16 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 		WARN_ON(1);
 		return NULL;
 	}
-	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
-		return NULL;
 	if (iov_iter_is_bvec(new))
 		return new->bvec = kmemdup(new->bvec,
 				    new->nr_segs * sizeof(struct bio_vec),
 				    flags);
-	else
+	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
 		/* iovec and kvec have identical layout */
 		return new->iov = kmemdup(new->iov,
 				   new->nr_segs * sizeof(struct iovec),
 				   flags);
+	return NULL;
 }
 EXPORT_SYMBOL(dup_iter);
 
@@ -1953,10 +2002,12 @@ EXPORT_SYMBOL(import_single_range);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 {
 	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
-			 !iov_iter_is_kvec(i))
+			 !iov_iter_is_kvec(i) && !iter_is_ubuf(i))
 		return;
 	i->iov_offset = state->iov_offset;
 	i->count = state->count;
+	if (iter_is_ubuf(i))
+		return;
 	/*
 	 * For the *vec iters, nr_segs + iov is constant - if we increment
 	 * the vec, then we also decrement the nr_segs count. Hence we don't
diff --git a/mm/shmem.c b/mm/shmem.c
index e5e43b990fdc..e3a7e171bbd1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2602,7 +2602,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			ret = copy_page_to_iter(page, offset, nr, to);
 			put_page(page);
 
-		} else if (iter_is_iovec(to)) {
+		} else if (user_backed_iter(to)) {
 			/*
 			 * Copy to user tends to be so well optimized, but
 			 * clear_user() not so much, that it is noticeably
-- 
cgit v1.2.3


From 10f525a8cd7a525e9fc73288bb35428c9cad5e63 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Jun 2022 02:02:51 -0400
Subject: ITER_PIPE: cache the type of last buffer

We often need to find whether the last buffer is anon or not, and
currently it's rather clumsy:
	check if ->iov_offset is non-zero (i.e. that pipe is not empty)
	if so, get the corresponding pipe_buffer and check its ->ops
	if it's &default_pipe_buf_ops, we have an anon buffer.

Let's replace the use of ->iov_offset (which is nowhere near similar to
its role for other flavours) with signed field (->last_offset), with
the following rules:
	empty, no buffers occupied:		0
	anon, with bytes up to N-1 filled:	N
	zero-copy, with bytes up to N-1 filled:	-N

That way abs(i->last_offset) is equal to what used to be in i->iov_offset
and empty vs. anon vs. zero-copy can be distinguished by the sign of
i->last_offset.

	Checks for "should we extend the last buffer or should we start
a new one?" become easier to follow that way.

	Note that most of the operations can only be done in a sane
state - i.e. when the pipe has nothing past the current position of
iterator.  About the only thing that could be done outside of that
state is iov_iter_advance(), which transitions to the sane state by
truncating the pipe.  There are only two cases where we leave the
sane state:
	1) iov_iter_get_pages()/iov_iter_get_pages_alloc().  Will be
dealt with later, when we make get_pages advancing - the callers are
actually happier that way.
	2) iov_iter copied, then something is put into the copy.  Since
they share the underlying pipe, the original gets behind.  When we
decide that we are done with the copy (original is not usable until then)
we advance the original.  direct_io used to be done that way; nowadays
it operates on the original and we do iov_iter_revert() to discard
the excessive data.  At the moment there's nothing in the kernel that
could do that to ITER_PIPE iterators, so this reason for insane state
is theoretical right now.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h |  5 +++-
 lib/iov_iter.c      | 77 ++++++++++++++++++++++++++---------------------------
 2 files changed, 42 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 85bef84fd294..e7fc29b5ad19 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -40,7 +40,10 @@ struct iov_iter {
 	bool nofault;
 	bool data_source;
 	bool user_backed;
-	size_t iov_offset;
+	union {
+		size_t iov_offset;
+		int last_offset;
+	};
 	size_t count;
 	union {
 		const struct iovec *iov;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index c2e08004a1eb..8834f3f61220 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -199,7 +199,7 @@ static bool sanity(const struct iov_iter *i)
 	unsigned int i_head = i->head;
 	unsigned int idx;
 
-	if (i->iov_offset) {
+	if (i->last_offset) {
 		struct pipe_buffer *p;
 		if (unlikely(p_occupancy == 0))
 			goto Bad;	// pipe must be non-empty
@@ -207,7 +207,7 @@ static bool sanity(const struct iov_iter *i)
 			goto Bad;	// must be at the last buffer...
 
 		p = pipe_buf(pipe, i_head);
-		if (unlikely(p->offset + p->len != i->iov_offset))
+		if (unlikely(p->offset + p->len != abs(i->last_offset)))
 			goto Bad;	// ... at the end of segment
 	} else {
 		if (i_head != p_head)
@@ -215,7 +215,7 @@ static bool sanity(const struct iov_iter *i)
 	}
 	return true;
 Bad:
-	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
+	printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset);
 	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
 			p_head, p_tail, pipe->ring_size);
 	for (idx = 0; idx < pipe->ring_size; idx++)
@@ -259,30 +259,31 @@ static void push_page(struct pipe_inode_info *pipe, struct page *page,
 	get_page(page);
 }
 
-static inline bool allocated(struct pipe_buffer *buf)
+static inline int last_offset(const struct pipe_buffer *buf)
 {
-	return buf->ops == &default_pipe_buf_ops;
+	if (buf->ops == &default_pipe_buf_ops)
+		return buf->len;	// buf->offset is 0 for those
+	else
+		return -(buf->offset + buf->len);
 }
 
 static struct page *append_pipe(struct iov_iter *i, size_t size,
 				unsigned int *off)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	size_t offset = i->iov_offset;
+	int offset = i->last_offset;
 	struct pipe_buffer *buf;
 	struct page *page;
 
-	if (offset && offset < PAGE_SIZE) {
-		// some space in the last buffer; can we add to it?
+	if (offset > 0 && offset < PAGE_SIZE) {
+		// some space in the last buffer; add to it
 		buf = pipe_buf(pipe, pipe->head - 1);
-		if (allocated(buf)) {
-			size = min_t(size_t, size, PAGE_SIZE - offset);
-			buf->len += size;
-			i->iov_offset += size;
-			i->count -= size;
-			*off = offset;
-			return buf->page;
-		}
+		size = min_t(size_t, size, PAGE_SIZE - offset);
+		buf->len += size;
+		i->last_offset += size;
+		i->count -= size;
+		*off = offset;
+		return buf->page;
 	}
 	// OK, we need a new buffer
 	*off = 0;
@@ -293,7 +294,7 @@ static struct page *append_pipe(struct iov_iter *i, size_t size,
 	if (!page)
 		return NULL;
 	i->head = pipe->head - 1;
-	i->iov_offset = size;
+	i->last_offset = size;
 	i->count -= size;
 	return page;
 }
@@ -313,11 +314,11 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 	if (!sanity(i))
 		return 0;
 
-	if (offset && i->iov_offset == offset) { // could we merge it?
+	if (offset && i->last_offset == -offset) { // could we merge it?
 		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
 		if (buf->page == page) {
 			buf->len += bytes;
-			i->iov_offset += bytes;
+			i->last_offset -= bytes;
 			i->count -= bytes;
 			return bytes;
 		}
@@ -326,7 +327,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 		return 0;
 
 	push_page(pipe, page, offset, bytes);
-	i->iov_offset = offset + bytes;
+	i->last_offset = -(offset + bytes);
 	i->head = head;
 	i->count -= bytes;
 	return bytes;
@@ -438,16 +439,15 @@ EXPORT_SYMBOL(iov_iter_init);
 static inline void data_start(const struct iov_iter *i,
 			      unsigned int *iter_headp, size_t *offp)
 {
-	unsigned int iter_head = i->head;
-	size_t off = i->iov_offset;
+	int off = i->last_offset;
 
-	if (off && (!allocated(pipe_buf(i->pipe, iter_head)) ||
-		    off == PAGE_SIZE)) {
-		iter_head++;
-		off = 0;
+	if (off > 0 && off < PAGE_SIZE) { // anon and not full
+		*iter_headp = i->pipe->head - 1;
+		*offp = off;
+	} else {
+		*iter_headp = i->pipe->head;
+		*offp = 0;
 	}
-	*iter_headp = iter_head;
-	*offp = off;
 }
 
 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
@@ -819,7 +819,7 @@ EXPORT_SYMBOL(copy_page_from_iter_atomic);
 static void pipe_advance(struct iov_iter *i, size_t size)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	unsigned int off = i->iov_offset;
+	int off = i->last_offset;
 
 	if (!off && !size) {
 		pipe_discard_from(pipe, i->start_head); // discard everything
@@ -829,10 +829,10 @@ static void pipe_advance(struct iov_iter *i, size_t size)
 	while (1) {
 		struct pipe_buffer *buf = pipe_buf(pipe, i->head);
 		if (off) /* make it relative to the beginning of buffer */
-			size += off - buf->offset;
+			size += abs(off) - buf->offset;
 		if (size <= buf->len) {
 			buf->len = size;
-			i->iov_offset = buf->offset + size;
+			i->last_offset = last_offset(buf);
 			break;
 		}
 		size -= buf->len;
@@ -916,7 +916,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 			struct pipe_buffer *b = pipe_buf(pipe, --head);
 			if (unroll < b->len) {
 				b->len -= unroll;
-				i->iov_offset = b->offset + b->len;
+				i->last_offset = last_offset(b);
 				i->head = head;
 				return;
 			}
@@ -924,7 +924,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 			pipe_buf_release(pipe, b);
 			pipe->head--;
 		}
-		i->iov_offset = 0;
+		i->last_offset = 0;
 		i->head = head;
 		return;
 	}
@@ -1027,7 +1027,7 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
 		.pipe = pipe,
 		.head = pipe->head,
 		.start_head = pipe->head,
-		.iov_offset = 0,
+		.last_offset = 0,
 		.count = count
 	};
 }
@@ -1158,13 +1158,12 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 		return iov_iter_aligned_bvec(i, addr_mask, len_mask);
 
 	if (iov_iter_is_pipe(i)) {
-		unsigned int p_mask = i->pipe->ring_size - 1;
 		size_t size = i->count;
 
 		if (size & len_mask)
 			return false;
-		if (size && allocated(&i->pipe->bufs[i->head & p_mask])) {
-			if (i->iov_offset & addr_mask)
+		if (size && i->last_offset > 0) {
+			if (i->last_offset & addr_mask)
 				return false;
 		}
 
@@ -1243,8 +1242,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 	if (iov_iter_is_pipe(i)) {
 		size_t size = i->count;
 
-		if (size && i->iov_offset && allocated(pipe_buf(i->pipe, i->head)))
-			return size | i->iov_offset;
+		if (size && i->last_offset > 0)
+			return size | i->last_offset;
 		return size;
 	}
 
-- 
cgit v1.2.3


From 12d426ab64a1c75f1b2ee5c33e933a4c16004049 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Jun 2022 09:44:38 -0400
Subject: ITER_PIPE: fold data_start() and pipe_space_for_user() together

All their callers are next to each other; all of them
want the total amount of pages and, possibly, the
offset in the partial final buffer.

Combine into a new helper (pipe_npages()), fix the
bogosity in pipe_space_for_user(), while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/pipe_fs_i.h | 20 --------------------
 lib/iov_iter.c            | 44 +++++++++++++++++++-------------------------
 2 files changed, 19 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4ea496924106..6cb65df3e3ba 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -156,26 +156,6 @@ static inline bool pipe_full(unsigned int head, unsigned int tail,
 	return pipe_occupancy(head, tail) >= limit;
 }
 
-/**
- * pipe_space_for_user - Return number of slots available to userspace
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
- * @pipe: The pipe info structure
- */
-static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int tail,
-					       struct pipe_inode_info *pipe)
-{
-	unsigned int p_occupancy, p_space;
-
-	p_occupancy = pipe_occupancy(head, tail);
-	if (p_occupancy >= pipe->max_usage)
-		return 0;
-	p_space = pipe->ring_size - p_occupancy;
-	if (p_space > pipe->max_usage)
-		p_space = pipe->max_usage;
-	return p_space;
-}
-
 /**
  * pipe_buf_get - get a reference to a pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 8834f3f61220..12dda1013bea 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -436,18 +436,20 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static inline void data_start(const struct iov_iter *i,
-			      unsigned int *iter_headp, size_t *offp)
+// returns the offset in partial buffer (if any)
+static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages)
 {
+	struct pipe_inode_info *pipe = i->pipe;
+	int used = pipe->head - pipe->tail;
 	int off = i->last_offset;
 
+	*npages = max((int)pipe->max_usage - used, 0);
+
 	if (off > 0 && off < PAGE_SIZE) { // anon and not full
-		*iter_headp = i->pipe->head - 1;
-		*offp = off;
-	} else {
-		*iter_headp = i->pipe->head;
-		*offp = 0;
+		(*npages)++;
+		return off;
 	}
+	return 0;
 }
 
 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
@@ -1318,18 +1320,16 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
-	unsigned int iter_head, npages;
+	unsigned int npages, off;
 	size_t capacity;
 
 	if (!sanity(i))
 		return -EFAULT;
 
-	data_start(i, &iter_head, start);
-	/* Amount of free space: some of this one + all after this one */
-	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
-	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
+	*start = off = pipe_npages(i, &npages);
+	capacity = min(npages, maxpages) * PAGE_SIZE - off;
 
-	return __pipe_get_pages(i, min(maxsize, capacity), pages, *start);
+	return __pipe_get_pages(i, min(maxsize, capacity), pages, off);
 }
 
 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
@@ -1494,24 +1494,22 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 		   size_t *start)
 {
 	struct page **p;
-	unsigned int iter_head, npages;
+	unsigned int npages, off;
 	ssize_t n;
 
 	if (!sanity(i))
 		return -EFAULT;
 
-	data_start(i, &iter_head, start);
-	/* Amount of free space: some of this one + all after this one */
-	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
-	n = npages * PAGE_SIZE - *start;
+	*start = off = pipe_npages(i, &npages);
+	n = npages * PAGE_SIZE - off;
 	if (maxsize > n)
 		maxsize = n;
 	else
-		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
+		npages = DIV_ROUND_UP(maxsize + off, PAGE_SIZE);
 	p = get_pages_array(npages);
 	if (!p)
 		return -ENOMEM;
-	n = __pipe_get_pages(i, maxsize, p, *start);
+	n = __pipe_get_pages(i, maxsize, p, off);
 	if (n > 0)
 		*pages = p;
 	else
@@ -1739,16 +1737,12 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 	if (iov_iter_is_bvec(i))
 		return bvec_npages(i, maxpages);
 	if (iov_iter_is_pipe(i)) {
-		unsigned int iter_head;
 		int npages;
-		size_t off;
 
 		if (!sanity(i))
 			return 0;
 
-		data_start(i, &iter_head, &off);
-		/* some of this one + all after this one */
-		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
+		pipe_npages(i, &npages);
 		return min(npages, maxpages);
 	}
 	if (iov_iter_is_xarray(i)) {
-- 
cgit v1.2.3


From 1ef255e257173f4bc44317ef2076e7e0de688fdf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jun 2022 10:28:36 -0400
Subject: iov_iter: advancing variants of iov_iter_get_pages{,_alloc}()

Most of the users immediately follow successful iov_iter_get_pages()
with advancing by the amount it had returned.

Provide inline wrappers doing that, convert trivial open-coded
uses of those.

BTW, iov_iter_get_pages() never returns more than it had been asked
to; such checks in cifs ought to be removed someday...

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/vhost/scsi.c |  4 +---
 fs/ceph/file.c       |  3 +--
 fs/cifs/file.c       |  6 ++----
 fs/cifs/misc.c       |  3 +--
 fs/direct-io.c       |  3 +--
 fs/fuse/dev.c        |  3 +--
 fs/fuse/file.c       |  3 +--
 fs/nfs/direct.c      |  6 ++----
 include/linux/uio.h  | 20 ++++++++++++++++++++
 net/core/datagram.c  |  3 +--
 net/core/skmsg.c     |  3 +--
 net/rds/message.c    |  3 +--
 net/tls/tls_sw.c     |  4 +---
 13 files changed, 34 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index ffd9e6c2ffc1..9b65509424dc 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -643,14 +643,12 @@ vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd,
 	size_t offset;
 	unsigned int npages = 0;
 
-	bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
+	bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
 				VHOST_SCSI_PREALLOC_UPAGES, &offset);
 	/* No pages were pinned */
 	if (bytes <= 0)
 		return bytes < 0 ? bytes : -EFAULT;
 
-	iov_iter_advance(iter, bytes);
-
 	while (bytes) {
 		unsigned n = min_t(unsigned, PAGE_SIZE - offset, bytes);
 		sg_set_page(sg++, pages[npages++], n, offset);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c535de5852bf..8fab5db16c73 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -95,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
 		size_t start;
 		int idx = 0;
 
-		bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+		bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
 					   ITER_GET_BVECS_PAGES, &start);
 		if (bytes < 0)
 			return size ?: bytes;
 
-		iov_iter_advance(iter, bytes);
 		size += bytes;
 
 		for ( ; bytes; idx++, bvec_idx++) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e1e05b253daa..3ba013e2987f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3022,7 +3022,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 		if (ctx->direct_io) {
 			ssize_t result;
 
-			result = iov_iter_get_pages_alloc(
+			result = iov_iter_get_pages_alloc2(
 				from, &pagevec, cur_len, &start);
 			if (result < 0) {
 				cifs_dbg(VFS,
@@ -3036,7 +3036,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
 				break;
 			}
 			cur_len = (size_t)result;
-			iov_iter_advance(from, cur_len);
 
 			nr_pages =
 				(cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -3758,7 +3757,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		if (ctx->direct_io) {
 			ssize_t result;
 
-			result = iov_iter_get_pages_alloc(
+			result = iov_iter_get_pages_alloc2(
 					&direct_iov, &pagevec,
 					cur_len, &start);
 			if (result < 0) {
@@ -3774,7 +3773,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 				break;
 			}
 			cur_len = (size_t)result;
-			iov_iter_advance(&direct_iov, cur_len);
 
 			rdata = cifs_readdata_direct_alloc(
 					pagevec, cifs_uncached_readv_complete);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0e84e6fcf8ab..f833953bab61 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1029,7 +1029,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
 	saved_len = count;
 
 	while (count && npages < max_pages) {
-		rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+		rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start);
 		if (rc < 0) {
 			cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc);
 			break;
@@ -1041,7 +1041,6 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
 			break;
 		}
 
-		iov_iter_advance(iter, rc);
 		count -= rc;
 		rc += start;
 		cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c7fc01c2d509..f669163d5860 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -169,7 +169,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
+	ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
 				&sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
@@ -191,7 +191,6 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 	}
 
 	if (ret >= 0) {
-		iov_iter_advance(sdio->iter, ret);
 		ret += sdio->from;
 		sdio->head = 0;
 		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8d657c2cd6f7..51897427a534 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -730,14 +730,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		}
 	} else {
 		size_t off;
-		err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+		err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
 		if (err < 0)
 			return err;
 		BUG_ON(!err);
 		cs->len = err;
 		cs->offset = off;
 		cs->pg = page;
-		iov_iter_advance(cs->iter, err);
 	}
 
 	return lock_request(cs->req);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c982e3afe3b4..69e19fc0afc1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1401,14 +1401,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
 	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
 		unsigned npages;
 		size_t start;
-		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
+		ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
 					*nbytesp - nbytes,
 					max_pages - ap->num_pages,
 					&start);
 		if (ret < 0)
 			break;
 
-		iov_iter_advance(ii, ret);
 		nbytes += ret;
 
 		ret += start;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 022e1ce63e62..c275c83f0aef 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -364,13 +364,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 		size_t pgbase;
 		unsigned npages, i;
 
-		result = iov_iter_get_pages_alloc(iter, &pagevec, 
+		result = iov_iter_get_pages_alloc2(iter, &pagevec,
 						  rsize, &pgbase);
 		if (result < 0)
 			break;
 	
 		bytes = result;
-		iov_iter_advance(iter, bytes);
 		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
 		for (i = 0; i < npages; i++) {
 			struct nfs_page *req;
@@ -812,13 +811,12 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 		size_t pgbase;
 		unsigned npages, i;
 
-		result = iov_iter_get_pages_alloc(iter, &pagevec, 
+		result = iov_iter_get_pages_alloc2(iter, &pagevec,
 						  wsize, &pgbase);
 		if (result < 0)
 			break;
 
 		bytes = result;
-		iov_iter_advance(iter, bytes);
 		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
 		for (i = 0; i < npages; i++) {
 			struct nfs_page *req;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e7fc29b5ad19..b70d28693400 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -351,4 +351,24 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	};
 }
 
+static inline ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
+			size_t maxsize, unsigned maxpages, size_t *start)
+{
+	ssize_t res = iov_iter_get_pages(i, pages, maxsize, maxpages, start);
+
+	if (res >= 0)
+		iov_iter_advance(i, res);
+	return res;
+}
+
+static inline ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
+			size_t maxsize, size_t *start)
+{
+	ssize_t res = iov_iter_get_pages_alloc(i, pages, maxsize, start);
+
+	if (res >= 0)
+		iov_iter_advance(i, res);
+	return res;
+}
+
 #endif
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f3988ef8e9af..7255531f63ae 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -632,12 +632,11 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 		if (frag == MAX_SKB_FRAGS)
 			return -EMSGSIZE;
 
-		copied = iov_iter_get_pages(from, pages, length,
+		copied = iov_iter_get_pages2(from, pages, length,
 					    MAX_SKB_FRAGS - frag, &start);
 		if (copied < 0)
 			return -EFAULT;
 
-		iov_iter_advance(from, copied);
 		length -= copied;
 
 		truesize = PAGE_ALIGN(copied + start);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 81627892bdd4..cf3c24c8610d 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -324,14 +324,13 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			goto out;
 		}
 
-		copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+		copied = iov_iter_get_pages2(from, pages, bytes, maxpages,
 					    &offset);
 		if (copied <= 0) {
 			ret = -EFAULT;
 			goto out;
 		}
 
-		iov_iter_advance(from, copied);
 		bytes -= copied;
 		msg->sg.size += copied;
 
diff --git a/net/rds/message.c b/net/rds/message.c
index 799034e0f513..d74be4e3f3fa 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -391,7 +391,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
 		size_t start;
 		ssize_t copied;
 
-		copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+		copied = iov_iter_get_pages2(from, &pages, PAGE_SIZE,
 					    1, &start);
 		if (copied < 0) {
 			struct mmpin *mmp;
@@ -405,7 +405,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
 			goto err;
 		}
 		total_copied += copied;
-		iov_iter_advance(from, copied);
 		length -= copied;
 		sg_set_page(sg, pages, copied, start);
 		rm->data.op_nents++;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 17db8c8811fa..f76119f62f1b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1352,7 +1352,7 @@ static int tls_setup_from_iter(struct iov_iter *from,
 			rc = -EFAULT;
 			goto out;
 		}
-		copied = iov_iter_get_pages(from, pages,
+		copied = iov_iter_get_pages2(from, pages,
 					    length,
 					    maxpages, &offset);
 		if (copied <= 0) {
@@ -1360,8 +1360,6 @@ static int tls_setup_from_iter(struct iov_iter *from,
 			goto out;
 		}
 
-		iov_iter_advance(from, copied);
-
 		length -= copied;
 		size += copied;
 		while (copied) {
-- 
cgit v1.2.3


From eba2d3d798295dc43cae8fade102f9d083a2a741 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 10 Jun 2022 13:05:12 -0400
Subject: get rid of non-advancing variants

mechanical change; will be further massaged in subsequent commits

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h | 24 ++----------------------
 lib/iov_iter.c      | 27 ++++++++++++++++++---------
 2 files changed, 20 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index b70d28693400..5896af36199c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -247,9 +247,9 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
-ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
+ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
+ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
@@ -351,24 +351,4 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
 	};
 }
 
-static inline ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
-			size_t maxsize, unsigned maxpages, size_t *start)
-{
-	ssize_t res = iov_iter_get_pages(i, pages, maxsize, maxpages, start);
-
-	if (res >= 0)
-		iov_iter_advance(i, res);
-	return res;
-}
-
-static inline ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
-			size_t maxsize, size_t *start)
-{
-	ssize_t res = iov_iter_get_pages_alloc(i, pages, maxsize, start);
-
-	if (res >= 0)
-		iov_iter_advance(i, res);
-	return res;
-}
-
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f003a20d8683..c48c83602aae 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1328,6 +1328,7 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 		left -= PAGE_SIZE - off;
 		if (left <= 0) {
 			buf->len += maxsize;
+			iov_iter_advance(i, maxsize);
 			return maxsize;
 		}
 		buf->len = PAGE_SIZE;
@@ -1347,7 +1348,9 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 	}
 	if (!npages)
 		return -EFAULT;
-	return maxsize - left;
+	maxsize -= left;
+	iov_iter_advance(i, maxsize);
+	return maxsize;
 }
 
 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
@@ -1397,7 +1400,9 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
 	if (nr == 0)
 		return 0;
 
-	return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
+	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
+	iov_iter_advance(i, maxsize);
+	return maxsize;
 }
 
 /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
@@ -1469,7 +1474,9 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 		res = get_user_pages_fast(addr, n, gup_flags, *pages);
 		if (unlikely(res <= 0))
 			return res;
-		return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
+		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
+		iov_iter_advance(i, maxsize);
+		return maxsize;
 	}
 	if (iov_iter_is_bvec(i)) {
 		struct page **p;
@@ -1481,8 +1488,10 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 			return -ENOMEM;
 		p = *pages;
 		for (int k = 0; k < n; k++)
-			get_page(*p++ = page++);
-		return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
+			get_page(p[k] = page + k);
+		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
+		iov_iter_advance(i, maxsize);
+		return maxsize;
 	}
 	if (iov_iter_is_pipe(i))
 		return pipe_get_pages(i, pages, maxsize, maxpages, start);
@@ -1491,7 +1500,7 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 	return -EFAULT;
 }
 
-ssize_t iov_iter_get_pages(struct iov_iter *i,
+ssize_t iov_iter_get_pages2(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
@@ -1501,9 +1510,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 
 	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
 }
-EXPORT_SYMBOL(iov_iter_get_pages);
+EXPORT_SYMBOL(iov_iter_get_pages2);
 
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
 {
@@ -1518,7 +1527,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 	}
 	return len;
 }
-EXPORT_SYMBOL(iov_iter_get_pages_alloc);
+EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
 
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 			       struct iov_iter *i)
-- 
cgit v1.2.3


From 46dae32fe625a75f549c3a70edc77b778197bb05 Mon Sep 17 00:00:00 2001
From: Youngmin Nam <youngmin.nam@samsung.com>
Date: Tue, 12 Jul 2022 18:47:15 +0900
Subject: time: Correct the prototype of ns_to_kernel_old_timeval and
 ns_to_timespec64

In ns_to_kernel_old_timeval() definition, the function argument is defined
with const identifier in kernel/time/time.c, but the prototype in
include/linux/time32.h looks different.

- The function is defined in kernel/time/time.c as below:
  struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)

- The function is decalared in include/linux/time32.h as below:
  extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec);

Because the variable of arithmethic types isn't modified in the calling scope,
there's no need to mark arguments as const, which was already mentioned during
review (Link[1) of the original patch.

Likewise remove the "const" keyword in both definition and declaration of
ns_to_timespec64() as requested by Arnd (Link[2]).

Fixes: a84d1169164b ("y2038: Introduce struct __kernel_old_timeval")
Signed-off-by: Youngmin Nam <youngmin.nam@samsung.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/all/20220712094715.2918823-1-youngmin.nam@samsung.com
Link[1]: https://lore.kernel.org/all/20180310081123.thin6wphgk7tongy@gmail.com/
Link[2]: https://lore.kernel.org/all/CAK8P3a3nknJgEDESGdJH91jMj6R_xydFqWASd8r5BbesdvMBgA@mail.gmail.com/
---
 include/linux/time64.h | 2 +-
 kernel/time/time.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 2fb8232cff1d..f1bcea8c124a 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -145,7 +145,7 @@ static inline s64 timespec64_to_ns(const struct timespec64 *ts)
  *
  * Returns the timespec64 representation of the nsec parameter.
  */
-extern struct timespec64 ns_to_timespec64(const s64 nsec);
+extern struct timespec64 ns_to_timespec64(s64 nsec);
 
 /**
  * timespec64_add_ns - Adds nanoseconds to a timespec64
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 29923b20e0e4..526257b3727c 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -449,7 +449,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
 }
 EXPORT_SYMBOL(mktime64);
 
-struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
+struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
 {
 	struct timespec64 ts = ns_to_timespec64(nsec);
 	struct __kernel_old_timeval tv;
@@ -503,7 +503,7 @@ EXPORT_SYMBOL(set_normalized_timespec64);
  *
  * Returns the timespec64 representation of the nsec parameter.
  */
-struct timespec64 ns_to_timespec64(const s64 nsec)
+struct timespec64 ns_to_timespec64(s64 nsec)
 {
 	struct timespec64 ts = { 0, 0 };
 	s32 rem;
-- 
cgit v1.2.3


From af887e437bb298752b2edc5834048b8151b8aea0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 9 Aug 2022 12:50:28 -0400
Subject: NFS: Improve write error tracing

Don't leak request pointers, but use the "device:inode" labelling that
is used by all the other trace points. Furthermore, replace use of page
indexes with an offset, again in order to align behaviour with other
NFS trace points.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfstrace.h        | 36 +++++++++++++++++++++---------------
 fs/nfs/write.c           |  8 +++++---
 include/linux/nfs_page.h |  3 +--
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 8bd0c13a7c4b..731eecfdf49a 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1447,44 +1447,50 @@ TRACE_EVENT(nfs_writeback_done,
 
 DECLARE_EVENT_CLASS(nfs_page_error_class,
 		TP_PROTO(
+			const struct inode *inode,
 			const struct nfs_page *req,
 			int error
 		),
 
-		TP_ARGS(req, error),
+		TP_ARGS(inode, req, error),
 
 		TP_STRUCT__entry(
-			__field(const void *, req)
-			__field(pgoff_t, index)
-			__field(unsigned int, offset)
-			__field(unsigned int, pgbase)
-			__field(unsigned int, bytes)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+			__field(unsigned int, count)
 			__field(int, error)
 		),
 
 		TP_fast_assign(
-			__entry->req = req;
-			__entry->index = req->wb_index;
-			__entry->offset = req->wb_offset;
-			__entry->pgbase = req->wb_pgbase;
-			__entry->bytes = req->wb_bytes;
+			const struct nfs_inode *nfsi = NFS_I(inode);
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+			__entry->offset = req_offset(req);
+			__entry->count = req->wb_bytes;
 			__entry->error = error;
 		),
 
 		TP_printk(
-			"req=%p index=%lu offset=%u pgbase=%u bytes=%u error=%d",
-			__entry->req, __entry->index, __entry->offset,
-			__entry->pgbase, __entry->bytes, __entry->error
+			"error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld count=%u", __entry->error,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle, __entry->offset,
+			__entry->count
 		)
 );
 
 #define DEFINE_NFS_PAGEERR_EVENT(name) \
 	DEFINE_EVENT(nfs_page_error_class, name, \
 			TP_PROTO( \
+				const struct inode *inode, \
 				const struct nfs_page *req, \
 				int error \
 			), \
-			TP_ARGS(req, error))
+			TP_ARGS(inode, req, error))
 
 DEFINE_NFS_PAGEERR_EVENT(nfs_write_error);
 DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4adf2b488da1..4a3796811b4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -592,7 +592,8 @@ nfs_lock_and_join_requests(struct page *page)
 
 static void nfs_write_error(struct nfs_page *req, int error)
 {
-	trace_nfs_write_error(req, error);
+	trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req,
+			      error);
 	nfs_mapping_set_error(req->wb_page, error);
 	nfs_inode_remove_request(req);
 	nfs_end_page_writeback(req);
@@ -1000,7 +1001,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 		nfs_list_remove_request(req);
 		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
 		    (hdr->good_bytes < bytes)) {
-			trace_nfs_comp_error(req, hdr->error);
+			trace_nfs_comp_error(hdr->inode, req, hdr->error);
 			nfs_mapping_set_error(req->wb_page, hdr->error);
 			goto remove_req;
 		}
@@ -1882,7 +1883,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 			(long long)req_offset(req));
 		if (status < 0) {
 			if (req->wb_page) {
-				trace_nfs_commit_error(req, status);
+				trace_nfs_commit_error(data->inode, req,
+						       status);
 				nfs_mapping_set_error(req->wb_page, status);
 				nfs_inode_remove_request(req);
 			}
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index f0373a6cb5fb..ba7e2e4b0926 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -202,8 +202,7 @@ nfs_list_entry(struct list_head *head)
 	return list_entry(head, struct nfs_page, wb_list);
 }
 
-static inline
-loff_t req_offset(struct nfs_page *req)
+static inline loff_t req_offset(const struct nfs_page *req)
 {
 	return (((loff_t)req->wb_index) << PAGE_SHIFT) + req->wb_offset;
 }
-- 
cgit v1.2.3


From d4252071b97d2027d246f6a82cbee4d52f618b47 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Aug 2022 14:32:13 -0400
Subject: add barriers to buffer_uptodate and set_buffer_uptodate

Let's have a look at this piece of code in __bread_slow:

	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(REQ_OP_READ, 0, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;

Neither wait_on_buffer nor buffer_uptodate contain any memory barrier.
Consequently, if someone calls sb_bread and then reads the buffer data,
the read of buffer data may be executed before wait_on_buffer(bh) on
architectures with weak memory ordering and it may return invalid data.

Fix this bug by adding a memory barrier to set_buffer_uptodate and an
acquire barrier to buffer_uptodate (in a similar way as
folio_test_uptodate and folio_mark_uptodate).

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buffer_head.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 307445d1c69e..def8b8d30ccc 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -118,7 +118,6 @@ static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
  * of the form "mark_buffer_foo()".  These are higher-level functions which
  * do something in addition to setting a b_state bit.
  */
-BUFFER_FNS(Uptodate, uptodate)
 BUFFER_FNS(Dirty, dirty)
 TAS_BUFFER_FNS(Dirty, dirty)
 BUFFER_FNS(Lock, locked)
@@ -136,6 +135,30 @@ BUFFER_FNS(Meta, meta)
 BUFFER_FNS(Prio, prio)
 BUFFER_FNS(Defer_Completion, defer_completion)
 
+static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
+{
+	/*
+	 * make it consistent with folio_mark_uptodate
+	 * pairs with smp_load_acquire in buffer_uptodate
+	 */
+	smp_mb__before_atomic();
+	set_bit(BH_Uptodate, &bh->b_state);
+}
+
+static __always_inline void clear_buffer_uptodate(struct buffer_head *bh)
+{
+	clear_bit(BH_Uptodate, &bh->b_state);
+}
+
+static __always_inline int buffer_uptodate(const struct buffer_head *bh)
+{
+	/*
+	 * make it consistent with folio_test_uptodate
+	 * pairs with smp_mb__before_atomic in set_buffer_uptodate
+	 */
+	return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+}
+
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
 /* If we *know* page->private refers to buffer_heads */
-- 
cgit v1.2.3


From 2a0133723f9ebeb751cfce19f74ec07e108bef1f Mon Sep 17 00:00:00 2001
From: Hawkins Jiawei <yin31149@gmail.com>
Date: Fri, 5 Aug 2022 15:48:34 +0800
Subject: net: fix refcount bug in sk_psock_get (2)

Syzkaller reports refcount bug as follows:
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19
Modules linked in:
CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0
 <TASK>
 __refcount_add_not_zero include/linux/refcount.h:163 [inline]
 __refcount_inc_not_zero include/linux/refcount.h:227 [inline]
 refcount_inc_not_zero include/linux/refcount.h:245 [inline]
 sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439
 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091
 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983
 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057
 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659
 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682
 sk_backlog_rcv include/net/sock.h:1061 [inline]
 __release_sock+0x134/0x3b0 net/core/sock.c:2849
 release_sock+0x54/0x1b0 net/core/sock.c:3404
 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909
 __sys_shutdown_sock net/socket.c:2331 [inline]
 __sys_shutdown_sock net/socket.c:2325 [inline]
 __sys_shutdown+0xf1/0x1b0 net/socket.c:2343
 __do_sys_shutdown net/socket.c:2351 [inline]
 __se_sys_shutdown net/socket.c:2349 [inline]
 __x64_sys_shutdown+0x50/0x70 net/socket.c:2349
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
 </TASK>

During SMC fallback process in connect syscall, kernel will
replaces TCP with SMC. In order to forward wakeup
smc socket waitqueue after fallback, kernel will sets
clcsk->sk_user_data to origin smc socket in
smc_fback_replace_callbacks().

Later, in shutdown syscall, kernel will calls
sk_psock_get(), which treats the clcsk->sk_user_data
as psock type, triggering the refcnt warning.

So, the root cause is that smc and psock, both will use
sk_user_data field. So they will mismatch this field
easily.

This patch solves it by using another bit(defined as
SK_USER_DATA_PSOCK) in PTRMASK, to mark whether
sk_user_data points to a psock object or not.
This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e
("net, sk_msg: Clear sk_user_data pointer on clone if tagged").

For there will possibly be more flags in the sk_user_data field,
this patch also refactor sk_user_data flags code to be more generic
to improve its maintainability.

Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skmsg.h |  3 ++-
 include/net/sock.h    | 68 ++++++++++++++++++++++++++++++++++++---------------
 net/core/skmsg.c      |  4 ++-
 3 files changed, 53 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 153b6dec9b6a..48f4b645193b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -278,7 +278,8 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
 
 static inline struct sk_psock *sk_psock(const struct sock *sk)
 {
-	return rcu_dereference_sk_user_data(sk);
+	return __rcu_dereference_sk_user_data_with_flags(sk,
+							 SK_USER_DATA_PSOCK);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index a7273b289188..05a1bbdf5805 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -545,14 +545,26 @@ enum sk_pacing {
 	SK_PACING_FQ		= 2,
 };
 
-/* Pointer stored in sk_user_data might not be suitable for copying
- * when cloning the socket. For instance, it can point to a reference
- * counted object. sk_user_data bottom bit is set if pointer must not
- * be copied.
+/* flag bits in sk_user_data
+ *
+ * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
+ *   not be suitable for copying when cloning the socket. For instance,
+ *   it can point to a reference counted object. sk_user_data bottom
+ *   bit is set if pointer must not be copied.
+ *
+ * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
+ *   managed/owned by a BPF reuseport array. This bit should be set
+ *   when sk_user_data's sk is added to the bpf's reuseport_array.
+ *
+ * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
+ *   sk_user_data points to psock type. This bit should be set
+ *   when sk_user_data is assigned to a psock object.
  */
 #define SK_USER_DATA_NOCOPY	1UL
-#define SK_USER_DATA_BPF	2UL	/* Managed by BPF */
-#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
+#define SK_USER_DATA_BPF	2UL
+#define SK_USER_DATA_PSOCK	4UL
+#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+				  SK_USER_DATA_PSOCK)
 
 /**
  * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
@@ -565,24 +577,40 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 
 #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))
 
+/**
+ * __rcu_dereference_sk_user_data_with_flags - return the pointer
+ * only if argument flags all has been set in sk_user_data. Otherwise
+ * return NULL
+ *
+ * @sk: socket
+ * @flags: flag bits
+ */
+static inline void *
+__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
+					  uintptr_t flags)
+{
+	uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk));
+
+	WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);
+
+	if ((sk_user_data & flags) == flags)
+		return (void *)(sk_user_data & SK_USER_DATA_PTRMASK);
+	return NULL;
+}
+
 #define rcu_dereference_sk_user_data(sk)				\
+	__rcu_dereference_sk_user_data_with_flags(sk, 0)
+#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)		\
 ({									\
-	void *__tmp = rcu_dereference(__sk_user_data((sk)));		\
-	(void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK);		\
-})
-#define rcu_assign_sk_user_data(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
-	rcu_assign_pointer(__sk_user_data((sk)), __tmp);		\
-})
-#define rcu_assign_sk_user_data_nocopy(sk, ptr)				\
-({									\
-	uintptr_t __tmp = (uintptr_t)(ptr);				\
-	WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK);			\
+	uintptr_t __tmp1 = (uintptr_t)(ptr),				\
+		  __tmp2 = (uintptr_t)(flags);				\
+	WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);			\
+	WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);			\
 	rcu_assign_pointer(__sk_user_data((sk)),			\
-			   __tmp | SK_USER_DATA_NOCOPY);		\
+			   __tmp1 | __tmp2);				\
 })
+#define rcu_assign_sk_user_data(sk, ptr)				\
+	__rcu_assign_sk_user_data_with_flags(sk, ptr, 0)
 
 static inline
 struct net *sock_net(const struct sock *sk)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 81627892bdd4..57e942a6431a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -739,7 +739,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
 	refcount_set(&psock->refcnt, 1);
 
-	rcu_assign_sk_user_data_nocopy(sk, psock);
+	__rcu_assign_sk_user_data_with_flags(sk, psock,
+					     SK_USER_DATA_NOCOPY |
+					     SK_USER_DATA_PSOCK);
 	sock_hold(sk);
 
 out:
-- 
cgit v1.2.3


From c2a052a4a949df53f50a5024843432d2234cb824 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Fri, 24 Jun 2022 10:55:41 +0800
Subject: remoteproc: rename len of rpoc_vring to num

Rename the member len in the structure rpoc_vring to num. And remove 'in
bytes' from the comment of it. This is misleading. Because this actually
refers to the size of the virtio vring to be created. The unit is not
bytes.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Message-Id: <20220624025621.128843-2-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/remoteproc/remoteproc_core.c   |  4 ++--
 drivers/remoteproc/remoteproc_virtio.c | 10 +++++-----
 include/linux/remoteproc.h             |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 02a04ab34a23..2d2f3bab5888 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -334,7 +334,7 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
 	size_t size;
 
 	/* actual size of vring (in bytes) */
-	size = PAGE_ALIGN(vring_size(rvring->len, rvring->align));
+	size = PAGE_ALIGN(vring_size(rvring->num, rvring->align));
 
 	rsc = (void *)rproc->table_ptr + rvdev->rsc_offset;
 
@@ -401,7 +401,7 @@ rproc_parse_vring(struct rproc_vdev *rvdev, struct fw_rsc_vdev *rsc, int i)
 		return -EINVAL;
 	}
 
-	rvring->len = vring->num;
+	rvring->num = vring->num;
 	rvring->align = vring->align;
 	rvring->rvdev = rvdev;
 
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 70ab496d0431..d43d74733f0a 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -87,7 +87,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 	struct fw_rsc_vdev *rsc;
 	struct virtqueue *vq;
 	void *addr;
-	int len, size;
+	int num, size;
 
 	/* we're temporarily limited to two virtqueues per rvdev */
 	if (id >= ARRAY_SIZE(rvdev->vring))
@@ -104,20 +104,20 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 
 	rvring = &rvdev->vring[id];
 	addr = mem->va;
-	len = rvring->len;
+	num = rvring->num;
 
 	/* zero vring */
-	size = vring_size(len, rvring->align);
+	size = vring_size(num, rvring->align);
 	memset(addr, 0, size);
 
 	dev_dbg(dev, "vring%d: va %pK qsz %d notifyid %d\n",
-		id, addr, len, rvring->notifyid);
+		id, addr, num, rvring->notifyid);
 
 	/*
 	 * Create the new vq, and tell virtio we're not interested in
 	 * the 'weak' smp barriers, since we're talking with a real device.
 	 */
-	vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx,
+	vq = vring_new_virtqueue(id, num, rvring->align, vdev, false, ctx,
 				 addr, rproc_virtio_notify, callback, name);
 	if (!vq) {
 		dev_err(dev, "vring_new_virtqueue %s failed\n", name);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 7c943f0a2fc4..aea79c77db0f 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -597,7 +597,7 @@ struct rproc_subdev {
 /**
  * struct rproc_vring - remoteproc vring state
  * @va:	virtual address
- * @len: length, in bytes
+ * @num: vring size
  * @da: device address
  * @align: vring alignment
  * @notifyid: rproc-specific unique vring index
@@ -606,7 +606,7 @@ struct rproc_subdev {
  */
 struct rproc_vring {
 	void *va;
-	int len;
+	int num;
 	u32 da;
 	u32 align;
 	int notifyid;
-- 
cgit v1.2.3


From da802961832f9852886304290135457519815497 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:21 +0800
Subject: virtio: record the maximum queue num supported by the device.

virtio-net can display the maximum (supported by hardware) ring size in
ethtool -g eth0.

When the subsequent patch implements vring reset, it can judge whether
the ring size passed by the driver is legal based on this.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-2-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/um/drivers/virtio_uml.c             | 1 +
 drivers/platform/mellanox/mlxbf-tmfifo.c | 2 ++
 drivers/remoteproc/remoteproc_virtio.c   | 2 ++
 drivers/s390/virtio/virtio_ccw.c         | 3 +++
 drivers/virtio/virtio_mmio.c             | 2 ++
 drivers/virtio/virtio_pci_legacy.c       | 2 ++
 drivers/virtio/virtio_pci_modern.c       | 2 ++
 drivers/virtio/virtio_vdpa.c             | 2 ++
 include/linux/virtio.h                   | 2 ++
 9 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 82ff3785bf69..e719af8bdf56 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -958,6 +958,7 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
 		goto error_create;
 	}
 	vq->priv = info;
+	vq->num_max = num;
 	num = virtqueue_get_vring_size(vq);
 
 	if (vu_dev->protocol_features &
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 38800e86ed8a..1ae3c56b66b0 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -959,6 +959,8 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 			goto error;
 		}
 
+		vq->num_max = vring->num;
+
 		vqs[i] = vq;
 		vring->vq = vq;
 		vq->priv = vring;
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index d43d74733f0a..0f7706e23eb9 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -125,6 +125,8 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	vq->num_max = num;
+
 	rvring->vq = vq;
 	vq->priv = rvring;
 
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 161d3b141f0d..6b86d0280d6b 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -530,6 +530,9 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
 		err = -ENOMEM;
 		goto out_err;
 	}
+
+	vq->num_max = info->num;
+
 	/* it may have been reduced */
 	info->num = virtqueue_get_vring_size(vq);
 
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 945cb8fb60b6..3ff746e3f24a 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -403,6 +403,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 		goto error_new_virtqueue;
 	}
 
+	vq->num_max = num;
+
 	/* Activate the queue */
 	writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM);
 	if (vm_dev->version == 1) {
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index a5e5721145c7..2257f1b3d8ae 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -135,6 +135,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	if (!vq)
 		return ERR_PTR(-ENOMEM);
 
+	vq->num_max = num;
+
 	q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
 	if (q_pfn >> 32) {
 		dev_err(&vp_dev->pci_dev->dev,
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 623906b4996c..e7e0b8c850f6 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -218,6 +218,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	if (!vq)
 		return ERR_PTR(-ENOMEM);
 
+	vq->num_max = num;
+
 	/* activate the queue */
 	vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq));
 	vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq),
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index c40f7deb6b5a..9670cc79371d 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -183,6 +183,8 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 		goto error_new_virtqueue;
 	}
 
+	vq->num_max = max_num;
+
 	/* Setup virtqueue callback */
 	cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
 	cb.private = info;
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index d8fdf170637c..129bde7521e3 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -19,6 +19,7 @@
  * @priv: a pointer for the virtqueue implementation to use.
  * @index: the zero-based ordinal number for this queue.
  * @num_free: number of elements we expect to be able to fit.
+ * @num_max: the maximum number of elements supported by the device.
  *
  * A note on @num_free: with indirect buffers, each buffer needs one
  * element in the queue, otherwise a buffer will need one element per
@@ -31,6 +32,7 @@ struct virtqueue {
 	struct virtio_device *vdev;
 	unsigned int index;
 	unsigned int num_free;
+	unsigned int num_max;
 	void *priv;
 };
 
-- 
cgit v1.2.3


From 3086e9fc9173166774652a488467e4176ee1c81b Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:22 +0800
Subject: virtio: struct virtio_config_ops add callbacks for queue_reset

reset can be divided into the following four steps (example):
 1. transport: notify the device to reset the queue
 2. vring:     recycle the buffer submitted
 3. vring:     reset/resize the vring (may re-alloc)
 4. transport: mmap vring to device, and enable the queue

In order to support queue reset, add two callbacks in struct
virtio_config_ops to implement steps 1 and 4.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-3-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index b47c2e7ed0ee..36ec7be1f480 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -78,6 +78,18 @@ struct virtio_shm_region {
  * @set_vq_affinity: set the affinity for a virtqueue (optional).
  * @get_vq_affinity: get the affinity for a virtqueue (optional).
  * @get_shm_region: get a shared memory region based on the index.
+ * @disable_vq_and_reset: reset a queue individually (optional).
+ *	vq: the virtqueue
+ *	Returns 0 on success or error status
+ *	disable_vq_and_reset will guarantee that the callbacks are disabled and
+ *	synchronized.
+ *	Except for the callback, the caller should guarantee that the vring is
+ *	not accessed by any functions of virtqueue.
+ * @enable_vq_after_reset: enable a reset queue
+ *	vq: the virtqueue
+ *	Returns 0 on success or error status
+ *	If disable_vq_and_reset is set, then enable_vq_after_reset must also be
+ *	set.
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
@@ -104,6 +116,8 @@ struct virtio_config_ops {
 			int index);
 	bool (*get_shm_region)(struct virtio_device *vdev,
 			       struct virtio_shm_region *region, u8 id);
+	int (*disable_vq_and_reset)(struct virtqueue *vq);
+	int (*enable_vq_after_reset)(struct virtqueue *vq);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
-- 
cgit v1.2.3


From 07d9629d49584b6f79faa6158cd7aef7e6919703 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:27 +0800
Subject: virtio_ring: split: stop __vring_new_virtqueue as export symbol

There is currently only one place to reference __vring_new_virtqueue()
directly from the outside of virtio core. And here vring_new_virtqueue()
can be used instead.

Subsequent patches will modify __vring_new_virtqueue, so stop it as an
export symbol for now.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-8-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 25 ++++++++++++++++---------
 include/linux/virtio_ring.h  | 10 ----------
 tools/virtio/virtio_test.c   |  4 ++--
 3 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index a63ef2d99955..8ce6cc73d814 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -204,6 +204,14 @@ struct vring_virtqueue {
 #endif
 };
 
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+					       struct vring vring,
+					       struct virtio_device *vdev,
+					       bool weak_barriers,
+					       bool context,
+					       bool (*notify)(struct virtqueue *),
+					       void (*callback)(struct virtqueue *),
+					       const char *name);
 
 /*
  * Helpers.
@@ -2195,14 +2203,14 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
 EXPORT_SYMBOL_GPL(vring_interrupt);
 
 /* Only available for split ring */
-struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
-					struct virtio_device *vdev,
-					bool weak_barriers,
-					bool context,
-					bool (*notify)(struct virtqueue *),
-					void (*callback)(struct virtqueue *),
-					const char *name)
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+					       struct vring vring,
+					       struct virtio_device *vdev,
+					       bool weak_barriers,
+					       bool context,
+					       bool (*notify)(struct virtqueue *),
+					       void (*callback)(struct virtqueue *),
+					       const char *name)
 {
 	struct vring_virtqueue *vq;
 
@@ -2277,7 +2285,6 @@ err_state:
 	kfree(vq);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
 
 struct virtqueue *vring_create_virtqueue(
 	unsigned int index,
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index b485b13fa50b..8b8af1a38991 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -76,16 +76,6 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 					 void (*callback)(struct virtqueue *vq),
 					 const char *name);
 
-/* Creates a virtqueue with a custom layout. */
-struct virtqueue *__vring_new_virtqueue(unsigned int index,
-					struct vring vring,
-					struct virtio_device *vdev,
-					bool weak_barriers,
-					bool ctx,
-					bool (*notify)(struct virtqueue *),
-					void (*callback)(struct virtqueue *),
-					const char *name);
-
 /*
  * Creates a virtqueue with a standard layout but a caller-allocated
  * ring.
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index 23f142af544a..86a410ddcedd 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -102,8 +102,8 @@ static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev)
 
 	memset(info->ring, 0, vring_size(num, 4096));
 	vring_init(&info->vring, num, info->ring, 4096);
-	info->vq = __vring_new_virtqueue(info->idx, info->vring, vdev, true,
-					 false, vq_notify, vq_callback, "test");
+	info->vq = vring_new_virtqueue(info->idx, num, 4096, vdev, true, false,
+				       info->ring, vq_notify, vq_callback, "test");
 	assert(info->vq);
 	info->vq->priv = info;
 }
-- 
cgit v1.2.3


From c790e8e1817f1a17c05e64f1c4f16f231b8529d5 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:44 +0800
Subject: virtio_ring: introduce virtqueue_resize()

Introduce virtqueue_resize() to implement the resize of vring.
Based on these, the driver can dynamically adjust the size of the vring.
For example: ethtool -G.

virtqueue_resize() implements resize based on the vq reset function. In
case of failure to allocate a new vring, it will give up resize and use
the original vring.

During this process, if the re-enable reset vq fails, the vq can no
longer be used. Although the probability of this situation is not high.

The parameter recycle is used to recycle the buffer that is no longer
used.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-25-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio.h       |  3 ++
 2 files changed, 72 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index bea5a3448217..6447a09e2e38 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2539,6 +2539,75 @@ struct virtqueue *vring_create_virtqueue(
 }
 EXPORT_SYMBOL_GPL(vring_create_virtqueue);
 
+/**
+ * virtqueue_resize - resize the vring of vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @num: new ring num
+ * @recycle: callback for recycle the useless buffer
+ *
+ * When it is really necessary to create a new vring, it will set the current vq
+ * into the reset state. Then call the passed callback to recycle the buffer
+ * that is no longer used. Only after the new vring is successfully created, the
+ * old vring will be released.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size.
+ *  vq can still work normally
+ * -EBUSY: Failed to sync with device, vq may not work properly
+ * -ENOENT: Transport or device not supported
+ * -E2BIG/-EINVAL: num error
+ * -EPERM: Operation not permitted
+ *
+ */
+int virtqueue_resize(struct virtqueue *_vq, u32 num,
+		     void (*recycle)(struct virtqueue *vq, void *buf))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct virtio_device *vdev = vq->vq.vdev;
+	void *buf;
+	int err;
+
+	if (!vq->we_own_ring)
+		return -EPERM;
+
+	if (num > vq->vq.num_max)
+		return -E2BIG;
+
+	if (!num)
+		return -EINVAL;
+
+	if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num)
+		return 0;
+
+	if (!vdev->config->disable_vq_and_reset)
+		return -ENOENT;
+
+	if (!vdev->config->enable_vq_after_reset)
+		return -ENOENT;
+
+	err = vdev->config->disable_vq_and_reset(_vq);
+	if (err)
+		return err;
+
+	while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL)
+		recycle(_vq, buf);
+
+	if (vq->packed_ring)
+		err = virtqueue_resize_packed(_vq, num);
+	else
+		err = virtqueue_resize_split(_vq, num);
+
+	if (vdev->config->enable_vq_after_reset(_vq))
+		return -EBUSY;
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(virtqueue_resize);
+
 /* Only available for split ring */
 struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 129bde7521e3..62e31bca5602 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -91,6 +91,9 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *vq);
 dma_addr_t virtqueue_get_avail_addr(struct virtqueue *vq);
 dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
 
+int virtqueue_resize(struct virtqueue *vq, u32 num,
+		     void (*recycle)(struct virtqueue *vq, void *buf));
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
-- 
cgit v1.2.3


From ea024594b1dc5b6719c1400ae154690f5c203996 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:45 +0800
Subject: virtio_pci: struct virtio_pci_common_cfg add queue_notify_data

Add queue_notify_data in struct virtio_pci_common_cfg, which comes from
here https://github.com/oasis-tcs/virtio-spec/issues/89

In order not to affect the API, add a dedicated structure struct
virtio_pci_modern_common_cfg to virtio_pci_modern.h.

Since I want to add queue_reset after queue_notify_data, I submitted
this patch first.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-26-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_pci_modern.h | 7 +++++++
 include/uapi/linux/virtio_pci.h   | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index eb2bd9b4077d..41f5a018bd94 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -5,6 +5,13 @@
 #include <linux/pci.h>
 #include <linux/virtio_pci.h>
 
+struct virtio_pci_modern_common_cfg {
+	struct virtio_pci_common_cfg cfg;
+
+	__le16 queue_notify_data;	/* read-write */
+	__le16 padding;
+};
+
 struct virtio_pci_modern_device {
 	struct pci_dev *pci_dev;
 
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 3a86f36d7e3d..f5981a874481 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -202,6 +202,7 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_Q_AVAILHI	44
 #define VIRTIO_PCI_COMMON_Q_USEDLO	48
 #define VIRTIO_PCI_COMMON_Q_USEDHI	52
+#define VIRTIO_PCI_COMMON_Q_NDATA	56
 
 #endif /* VIRTIO_PCI_NO_MODERN */
 
-- 
cgit v1.2.3


From 3251063155032729b8793ac3957136ae25c0bafa Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:46 +0800
Subject: virtio: allow to unbreak/break virtqueue individually

This patch allows the new introduced
__virtqueue_break()/__virtqueue_unbreak() to break/unbreak the
virtqueue.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-27-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 24 ++++++++++++++++++++++++
 include/linux/virtio.h       |  3 +++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6447a09e2e38..accb3ae6cc95 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2724,6 +2724,30 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_break(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	WRITE_ONCE(vq->broken, true);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_break);
+
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_unbreak(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+	WRITE_ONCE(vq->broken, false);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_unbreak);
+
 bool virtqueue_is_broken(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 62e31bca5602..d45ee82a4470 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -138,6 +138,9 @@ bool is_virtio_device(struct device *dev);
 void virtio_break_device(struct virtio_device *dev);
 void __virtio_unbreak_device(struct virtio_device *dev);
 
+void __virtqueue_break(struct virtqueue *_vq);
+void __virtqueue_unbreak(struct virtqueue *_vq);
+
 void virtio_config_changed(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
-- 
cgit v1.2.3


From 4913e85441b40386c4bb093f188b955d8165f1b7 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:48 +0800
Subject: virtio_ring: struct virtqueue introduce reset

Introduce a new member reset to the structure virtqueue to determine
whether the current vq is in the reset state. Subsequent patches will
use it.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-29-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 2 ++
 include/linux/virtio.h       | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index accb3ae6cc95..d66c8e6d0ef3 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1996,6 +1996,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
 	vq->vq.index = index;
+	vq->vq.reset = false;
 	vq->we_own_ring = true;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
@@ -2481,6 +2482,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
 	vq->vq.index = index;
+	vq->vq.reset = false;
 	vq->we_own_ring = false;
 	vq->notify = notify;
 	vq->weak_barriers = weak_barriers;
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index d45ee82a4470..a3f73bb6733e 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -20,6 +20,7 @@
  * @index: the zero-based ordinal number for this queue.
  * @num_free: number of elements we expect to be able to fit.
  * @num_max: the maximum number of elements supported by the device.
+ * @reset: vq is in reset state or not.
  *
  * A note on @num_free: with indirect buffers, each buffer needs one
  * element in the queue, otherwise a buffer will need one element per
@@ -34,6 +35,7 @@ struct virtqueue {
 	unsigned int num_free;
 	unsigned int num_max;
 	void *priv;
+	bool reset;
 };
 
 int virtqueue_add_outbuf(struct virtqueue *vq,
-- 
cgit v1.2.3


From 0cdd450e70510c9e13af8099e9f6c1467e6a0b91 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:49 +0800
Subject: virtio_pci: struct virtio_pci_common_cfg add queue_reset

Add queue_reset in virtio_pci_modern_common_cfg.

 https://github.com/oasis-tcs/virtio-spec/issues/124
 https://github.com/oasis-tcs/virtio-spec/issues/139

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-30-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_pci_modern.h | 2 +-
 include/uapi/linux/virtio_pci.h   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 41f5a018bd94..05123b9a606f 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -9,7 +9,7 @@ struct virtio_pci_modern_common_cfg {
 	struct virtio_pci_common_cfg cfg;
 
 	__le16 queue_notify_data;	/* read-write */
-	__le16 padding;
+	__le16 queue_reset;		/* read-write */
 };
 
 struct virtio_pci_modern_device {
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index f5981a874481..f703afc7ad31 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -203,6 +203,7 @@ struct virtio_pci_cfg_cap {
 #define VIRTIO_PCI_COMMON_Q_USEDLO	48
 #define VIRTIO_PCI_COMMON_Q_USEDHI	52
 #define VIRTIO_PCI_COMMON_Q_NDATA	56
+#define VIRTIO_PCI_COMMON_Q_RESET	58
 
 #endif /* VIRTIO_PCI_NO_MODERN */
 
-- 
cgit v1.2.3


From 0b50cece0b7857732d2055f2c77f8730c10f9196 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:50 +0800
Subject: virtio_pci: introduce helper to get/set queue reset

Introduce new helpers to implement queue reset and get queue reset
status.

 https://github.com/oasis-tcs/virtio-spec/issues/124
 https://github.com/oasis-tcs/virtio-spec/issues/139

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-31-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 39 ++++++++++++++++++++++++++++++++++
 include/linux/virtio_pci_modern.h      |  2 ++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index fa2a9445bb18..869cb46bef96 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -3,6 +3,7 @@
 #include <linux/virtio_pci_modern.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/delay.h>
 
 /*
  * vp_modern_map_capability - map a part of virtio pci capability
@@ -474,6 +475,44 @@ void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
 }
 EXPORT_SYMBOL_GPL(vp_modern_set_status);
 
+/*
+ * vp_modern_get_queue_reset - get the queue reset status
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ */
+int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg;
+
+	cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+
+	vp_iowrite16(index, &cfg->cfg.queue_select);
+	return vp_ioread16(&cfg->queue_reset);
+}
+EXPORT_SYMBOL_GPL(vp_modern_get_queue_reset);
+
+/*
+ * vp_modern_set_queue_reset - reset the queue
+ * @mdev: the modern virtio-pci device
+ * @index: queue index
+ */
+void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index)
+{
+	struct virtio_pci_modern_common_cfg __iomem *cfg;
+
+	cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common;
+
+	vp_iowrite16(index, &cfg->cfg.queue_select);
+	vp_iowrite16(1, &cfg->queue_reset);
+
+	while (vp_ioread16(&cfg->queue_reset))
+		msleep(1);
+
+	while (vp_ioread16(&cfg->cfg.queue_enable))
+		msleep(1);
+}
+EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset);
+
 /*
  * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue
  * @mdev: the modern virtio-pci device
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 05123b9a606f..c4eeb79b0139 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -113,4 +113,6 @@ void __iomem * vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
 				       u16 index, resource_size_t *pa);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
+int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
+void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index);
 #endif
-- 
cgit v1.2.3


From a10fba0377145fccefea4dc4dd5915b7ed87e546 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:53 +0800
Subject: virtio: find_vqs() add arg sizes

find_vqs() adds a new parameter sizes to specify the size of each vq
vring.

NULL as sizes means that all queues in find_vqs() use the maximum size.
A value in the array is 0, which means that the corresponding queue uses
the maximum size.

In the split scenario, the meaning of size is the largest size, because
it may be limited by memory, the virtio core will try a smaller size.
And the size is power of 2.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-34-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/um/drivers/virtio_uml.c             |  2 +-
 drivers/platform/mellanox/mlxbf-tmfifo.c |  1 +
 drivers/remoteproc/remoteproc_virtio.c   |  1 +
 drivers/s390/virtio/virtio_ccw.c         |  1 +
 drivers/virtio/virtio_mmio.c             |  1 +
 drivers/virtio/virtio_pci_common.c       |  2 +-
 drivers/virtio/virtio_pci_common.h       |  2 +-
 drivers/virtio/virtio_pci_modern.c       |  7 +++++--
 drivers/virtio/virtio_vdpa.c             |  1 +
 include/linux/virtio_config.h            | 14 +++++++++-----
 10 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index e719af8bdf56..79e38afd4b91 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -1011,7 +1011,7 @@ error_kzalloc:
 
 static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		       const char * const names[], const bool *ctx,
+		       const char * const names[], u32 sizes[], const bool *ctx,
 		       struct irq_affinity *desc)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 1ae3c56b66b0..8be13d416f48 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -928,6 +928,7 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 					struct virtqueue *vqs[],
 					vq_callback_t *callbacks[],
 					const char * const names[],
+					u32 sizes[],
 					const bool *ctx,
 					struct irq_affinity *desc)
 {
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 0f7706e23eb9..81c4f5776109 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -158,6 +158,7 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				 struct virtqueue *vqs[],
 				 vq_callback_t *callbacks[],
 				 const char * const names[],
+				 u32 sizes[],
 				 const bool * ctx,
 				 struct irq_affinity *desc)
 {
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 6b86d0280d6b..72500cd2dbf5 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -635,6 +635,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			       struct virtqueue *vqs[],
 			       vq_callback_t *callbacks[],
 			       const char * const names[],
+			       u32 sizes[],
 			       const bool *ctx,
 			       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 3ff746e3f24a..dfcecfd7aba1 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -474,6 +474,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		       struct virtqueue *vqs[],
 		       vq_callback_t *callbacks[],
 		       const char * const names[],
+		       u32 sizes[],
 		       const bool *ctx,
 		       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index ad258a9d3b9f..7ad734584823 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -396,7 +396,7 @@ out_del_vqs:
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], const bool *ctx,
+		const char * const names[], u32 sizes[], const bool *ctx,
 		struct irq_affinity *desc)
 {
 	int err;
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 23112d84218f..a5ff838b85a5 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -110,7 +110,7 @@ void vp_del_vqs(struct virtio_device *vdev);
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], const bool *ctx,
+		const char * const names[], u32 sizes[], const bool *ctx,
 		struct irq_affinity *desc);
 const char *vp_bus_name(struct virtio_device *vdev);
 
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index c3b9f2761849..be51ec849252 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -347,12 +347,15 @@ err:
 static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 			      struct virtqueue *vqs[],
 			      vq_callback_t *callbacks[],
-			      const char * const names[], const bool *ctx,
+			      const char * const names[],
+			      u32 sizes[],
+			      const bool *ctx,
 			      struct irq_affinity *desc)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq;
-	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc);
+	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, ctx,
+			     desc);
 
 	if (rc)
 		return rc;
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 9670cc79371d..832d2c5b1b19 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -269,6 +269,7 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				struct virtqueue *vqs[],
 				vq_callback_t *callbacks[],
 				const char * const names[],
+				u32 sizes[],
 				const bool *ctx,
 				struct irq_affinity *desc)
 {
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 36ec7be1f480..888f7e96f0c7 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -55,6 +55,7 @@ struct virtio_shm_region {
  *		include a NULL entry for vqs that do not need a callback
  *	names: array of virtqueue names (mainly for debugging)
  *		include a NULL entry for vqs unused by driver
+ *	sizes: array of virtqueue sizes
  *	Returns 0 on success or error status
  * @del_vqs: free virtqueues found by find_vqs().
  * @synchronize_cbs: synchronize with the virtqueue callbacks (optional)
@@ -103,7 +104,9 @@ struct virtio_config_ops {
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
 			struct virtqueue *vqs[], vq_callback_t *callbacks[],
-			const char * const names[], const bool *ctx,
+			const char * const names[],
+			u32 sizes[],
+			const bool *ctx,
 			struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	void (*synchronize_cbs)(struct virtio_device *);
@@ -212,7 +215,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	const char *names[] = { n };
 	struct virtqueue *vq;
 	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL,
-					 NULL);
+					 NULL, NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
@@ -224,7 +227,8 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[],
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
+				      NULL, desc);
 }
 
 static inline
@@ -233,8 +237,8 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx,
-				      desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
+				      ctx, desc);
 }
 
 /**
-- 
cgit v1.2.3


From fe3dc04e31aa51f91dc7f741a5f76cc4817eb5b4 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Mon, 1 Aug 2022 14:38:56 +0800
Subject: virtio: add helper virtio_find_vqs_ctx_size()

Introduce helper virtio_find_vqs_ctx_size() to call find_vqs and specify
the maximum size of each vq ring.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220801063902.129329-37-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 888f7e96f0c7..6adff09f7170 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -241,6 +241,18 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 				      ctx, desc);
 }
 
+static inline
+int virtio_find_vqs_ctx_size(struct virtio_device *vdev, u32 nvqs,
+			     struct virtqueue *vqs[],
+			     vq_callback_t *callbacks[],
+			     const char * const names[],
+			     u32 sizes[],
+			     const bool *ctx, struct irq_affinity *desc)
+{
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, sizes,
+				      ctx, desc);
+}
+
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
  * @vdev: the device
-- 
cgit v1.2.3


From cae15c2ed8e6e058bd5e32de292ab7982640161e Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 14 Jul 2022 14:39:26 +0300
Subject: vdpa/mlx5: Implement susupend virtqueue callback

Implement the suspend callback allowing to suspend the virtqueues so
they stop processing descriptors. This is required to allow to query a
consistent state of the virtqueue while live migration is taking place.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Message-Id: <20220714113927.85729-2-elic@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 83 ++++++++++++++++++++++++++++++++++++--
 include/linux/mlx5/mlx5_ifc_vdpa.h |  8 ++++
 2 files changed, 88 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 99bbbf38c8b1..a476e06476f6 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -164,6 +164,7 @@ struct mlx5_vdpa_net {
 	bool setup;
 	u32 cur_num_vqs;
 	u32 rqt_size;
+	bool nb_registered;
 	struct notifier_block nb;
 	struct vdpa_callback config_cb;
 	struct mlx5_vdpa_wq_ent cvq_ent;
@@ -895,6 +896,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	if (err)
 		goto err_cmd;
 
+	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
 	kfree(in);
 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
 
@@ -922,6 +924,7 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq
 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
 		return;
 	}
+	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
 	umems_destroy(ndev, mvq);
 }
 
@@ -1121,6 +1124,20 @@ err_cmd:
 	return err;
 }
 
+static bool is_valid_state_change(int oldstate, int newstate)
+{
+	switch (oldstate) {
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
+	default:
+		return false;
+	}
+}
+
 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
 {
 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
@@ -1130,6 +1147,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 	void *in;
 	int err;
 
+	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
+		return 0;
+
+	if (!is_valid_state_change(mvq->fw_state, state))
+		return -EINVAL;
+
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1992,6 +2015,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct mlx5_vdpa_virtqueue *mvq;
+	int err;
 
 	if (!mvdev->actual_features)
 		return;
@@ -2005,8 +2029,16 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready
 	}
 
 	mvq = &ndev->vqs[idx];
-	if (!ready)
+	if (!ready) {
 		suspend_vq(ndev, mvq);
+	} else {
+		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+		if (err) {
+			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
+			ready = false;
+		}
+	}
+
 
 	mvq->ready = ready;
 }
@@ -2733,6 +2765,37 @@ out_err:
 	return err;
 }
 
+static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_control_vq *cvq;
+
+	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+		return;
+
+	cvq = &mvdev->cvq;
+	cvq->ready = false;
+}
+
+static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+	int i;
+
+	down_write(&ndev->reslock);
+	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	ndev->nb_registered = false;
+	flush_workqueue(ndev->mvdev.wq);
+	for (i = 0; i < ndev->cur_num_vqs; i++) {
+		mvq = &ndev->vqs[i];
+		suspend_vq(ndev, mvq);
+	}
+	mlx5_vdpa_cvq_suspend(mvdev);
+	up_write(&ndev->reslock);
+	return 0;
+}
+
 static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.set_vq_address = mlx5_vdpa_set_vq_address,
 	.set_vq_num = mlx5_vdpa_set_vq_num,
@@ -2763,6 +2826,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.get_generation = mlx5_vdpa_get_generation,
 	.set_map = mlx5_vdpa_set_map,
 	.free = mlx5_vdpa_free,
+	.suspend = mlx5_vdpa_suspend,
 };
 
 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
@@ -2828,6 +2892,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev)
 		mvq->index = i;
 		mvq->ndev = ndev;
 		mvq->fwqp.fw = true;
+		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
 	}
 	for (; i < ndev->mvdev.max_vqs; i++) {
 		mvq = &ndev->vqs[i];
@@ -2902,13 +2967,21 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *p
 		switch (eqe->sub_type) {
 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			down_read(&ndev->reslock);
+			if (!ndev->nb_registered) {
+				up_read(&ndev->reslock);
+				return NOTIFY_DONE;
+			}
 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
-			if (!wqent)
+			if (!wqent) {
+				up_read(&ndev->reslock);
 				return NOTIFY_DONE;
+			}
 
 			wqent->mvdev = &ndev->mvdev;
 			INIT_WORK(&wqent->work, update_carrier);
 			queue_work(ndev->mvdev.wq, &wqent->work);
+			up_read(&ndev->reslock);
 			ret = NOTIFY_OK;
 			break;
 		default:
@@ -3062,6 +3135,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 
 	ndev->nb.notifier_call = event_handler;
 	mlx5_notifier_register(mdev, &ndev->nb);
+	ndev->nb_registered = true;
 	mvdev->vdev.mdev = &mgtdev->mgtdev;
 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
 	if (err)
@@ -3093,7 +3167,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct workqueue_struct *wq;
 
-	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	if (ndev->nb_registered) {
+		mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+		ndev->nb_registered = false;
+	}
 	wq = mvdev->wq;
 	mvdev->wq = NULL;
 	destroy_workqueue(wq);
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 4414ed5b6ed2..9becdc3fa503 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -150,6 +150,14 @@ enum {
 	MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR      = 0x3,
 };
 
+/* This indicates that the object was not created or has already
+ * been desroyed. It is very safe to assume that this object will never
+ * have so many states
+ */
+enum {
+	MLX5_VIRTIO_NET_Q_OBJECT_NONE = 0xffffffff
+};
+
 enum {
 	MLX5_RQTC_LIST_Q_TYPE_RQ            = 0x0,
 	MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q  = 0x1,
-- 
cgit v1.2.3


From 848ecea184e1253758423b37cbfc1ed732ccf5b4 Mon Sep 17 00:00:00 2001
From: Eugenio Pérez <eperezma@redhat.com>
Date: Wed, 10 Aug 2022 19:15:09 +0200
Subject: vdpa: Add suspend operation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This operation is optional: It it's not implemented, backend feature bit
will not be exposed.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Message-Id: <20220810171512.2343333-2-eperezma@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 7b4a13d3bd91..d282f464d2f1 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -218,6 +218,9 @@ struct vdpa_map_file {
  * @reset:			Reset device
  *				@vdev: vdpa device
  *				Returns integer: success (0) or error (< 0)
+ * @suspend:			Suspend or resume the device (optional)
+ *				@vdev: vdpa device
+ *				Returns integer: success (0) or error (< 0)
  * @get_config_size:		Get the size of the configuration space includes
  *				fields that are conditional on feature bits.
  *				@vdev: vdpa device
@@ -319,6 +322,7 @@ struct vdpa_config_ops {
 	u8 (*get_status)(struct vdpa_device *vdev);
 	void (*set_status)(struct vdpa_device *vdev, u8 status);
 	int (*reset)(struct vdpa_device *vdev);
+	int (*suspend)(struct vdpa_device *vdev);
 	size_t (*get_config_size)(struct vdpa_device *vdev);
 	void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
 			   void *buf, unsigned int len);
-- 
cgit v1.2.3


From addebd9ac9ca0ef8b3764907bf8018e48caffc64 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 12 Aug 2022 15:56:33 -0700
Subject: fs: don't randomize struct kiocb fields

This is a size sensitive structure and randomizing can introduce extra
padding that breaks io_uring's fixed size expectations. There are few
fields here as it is, half of which need a fixed order to optimally
pack, so the randomization isn't providing much.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/io-uring/b6f508ca-b1b2-5f40-7998-e4cff1cf7212@kernel.dk/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9f131e559d05..daf69a6504b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -339,17 +339,12 @@ enum rw_hint {
 
 struct kiocb {
 	struct file		*ki_filp;
-
-	/* The 'ki_filp' pointer is shared in a union for aio */
-	randomized_struct_fields_start
-
 	loff_t			ki_pos;
 	void (*ki_complete)(struct kiocb *iocb, long ret);
 	void			*private;
 	int			ki_flags;
 	u16			ki_ioprio; /* See linux/ioprio.h */
 	struct wait_page_queue	*ki_waitq; /* for async buffered IO */
-	randomized_struct_fields_end
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
-- 
cgit v1.2.3


From f2ccb5aed7bce1d8b3ed5b3385759a5509663028 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Thu, 11 Aug 2022 09:11:15 +0200
Subject: io_uring: make io_kiocb_to_cmd() typesafe

We need to make sure (at build time) that struct io_cmd_data is not
casted to a structure that's larger.

Signed-off-by: Stefan Metzmacher <metze@samba.org>
Link: https://lore.kernel.org/r/c024cdf25ae19fc0319d4180e2298bade8ed17b8.1660201408.git.metze@samba.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  9 ++++++++-
 io_uring/advise.c              |  8 ++++----
 io_uring/cancel.c              |  4 ++--
 io_uring/epoll.c               |  4 ++--
 io_uring/fs.c                  | 28 ++++++++++++++--------------
 io_uring/kbuf.c                |  8 ++++----
 io_uring/msg_ring.c            |  8 ++++----
 io_uring/net.c                 | 42 +++++++++++++++++++++---------------------
 io_uring/notif.c               |  2 --
 io_uring/notif.h               |  2 +-
 io_uring/openclose.c           | 16 ++++++++--------
 io_uring/poll.c                | 16 ++++++++--------
 io_uring/rsrc.c                | 10 +++++-----
 io_uring/rw.c                  | 28 ++++++++++++++--------------
 io_uring/splice.c              |  8 ++++----
 io_uring/statx.c               |  6 +++---
 io_uring/sync.c                | 12 ++++++------
 io_uring/timeout.c             | 26 +++++++++++++-------------
 io_uring/uring_cmd.c           |  8 ++++----
 io_uring/xattr.c               | 18 +++++++++---------
 20 files changed, 134 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..677a25d44d7f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -491,7 +491,14 @@ struct io_cmd_data {
 	__u8			data[56];
 };
 
-#define io_kiocb_to_cmd(req)	((void *) &(req)->cmd)
+static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
+{
+	BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
+}
+#define io_kiocb_to_cmd(req, cmd_type) ( \
+	io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \
+	((cmd_type *)&(req)->cmd) \
+)
 #define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
 
 struct io_kiocb {
diff --git a/io_uring/advise.c b/io_uring/advise.c
index 581956934c0b..449c6f14649f 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -31,7 +31,7 @@ struct io_madvise {
 int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
-	struct io_madvise *ma = io_kiocb_to_cmd(req);
+	struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
 
 	if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
 		return -EINVAL;
@@ -48,7 +48,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
-	struct io_madvise *ma = io_kiocb_to_cmd(req);
+	struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -64,7 +64,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_fadvise *fa = io_kiocb_to_cmd(req);
+	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
 
 	if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
 		return -EINVAL;
@@ -77,7 +77,7 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_fadvise *fa = io_kiocb_to_cmd(req);
+	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK) {
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 8435a1eba59a..e4e1dc0325f0 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -107,7 +107,7 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
 
 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_cancel *cancel = io_kiocb_to_cmd(req);
+	struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel);
 
 	if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
 		return -EINVAL;
@@ -164,7 +164,7 @@ static int __io_async_cancel(struct io_cancel_data *cd,
 
 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_cancel *cancel = io_kiocb_to_cmd(req);
+	struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel);
 	struct io_cancel_data cd = {
 		.ctx	= req->ctx,
 		.data	= cancel->addr,
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index a8b794471d6b..9aa74d2c80bc 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -23,7 +23,7 @@ struct io_epoll {
 
 int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_epoll *epoll = io_kiocb_to_cmd(req);
+	struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
 
 	pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will "
 		     "be removed in a future Linux kernel version.\n",
@@ -49,7 +49,7 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_epoll *ie = io_kiocb_to_cmd(req);
+	struct io_epoll *ie = io_kiocb_to_cmd(req, struct io_epoll);
 	int ret;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
diff --git a/io_uring/fs.c b/io_uring/fs.c
index 0de4f549bb7d..7100c293c13a 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -49,7 +49,7 @@ struct io_link {
 
 int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_rename *ren = io_kiocb_to_cmd(req);
+	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 	const char __user *oldf, *newf;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -79,7 +79,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rename *ren = io_kiocb_to_cmd(req);
+	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -95,7 +95,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_renameat_cleanup(struct io_kiocb *req)
 {
-	struct io_rename *ren = io_kiocb_to_cmd(req);
+	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
 
 	putname(ren->oldpath);
 	putname(ren->newpath);
@@ -103,7 +103,7 @@ void io_renameat_cleanup(struct io_kiocb *req)
 
 int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_unlink *un = io_kiocb_to_cmd(req);
+	struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
 	const char __user *fname;
 
 	if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
@@ -128,7 +128,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_unlink *un = io_kiocb_to_cmd(req);
+	struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -146,14 +146,14 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_unlinkat_cleanup(struct io_kiocb *req)
 {
-	struct io_unlink *ul = io_kiocb_to_cmd(req);
+	struct io_unlink *ul = io_kiocb_to_cmd(req, struct io_unlink);
 
 	putname(ul->filename);
 }
 
 int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_mkdir *mkd = io_kiocb_to_cmd(req);
+	struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
 	const char __user *fname;
 
 	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
@@ -175,7 +175,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_mkdir *mkd = io_kiocb_to_cmd(req);
+	struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -190,14 +190,14 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_mkdirat_cleanup(struct io_kiocb *req)
 {
-	struct io_mkdir *md = io_kiocb_to_cmd(req);
+	struct io_mkdir *md = io_kiocb_to_cmd(req, struct io_mkdir);
 
 	putname(md->filename);
 }
 
 int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_link *sl = io_kiocb_to_cmd(req);
+	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 	const char __user *oldpath, *newpath;
 
 	if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
@@ -225,7 +225,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_link *sl = io_kiocb_to_cmd(req);
+	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -240,7 +240,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_link *lnk = io_kiocb_to_cmd(req);
+	struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
 	const char __user *oldf, *newf;
 
 	if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
@@ -270,7 +270,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_link *lnk = io_kiocb_to_cmd(req);
+	struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -286,7 +286,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_link_cleanup(struct io_kiocb *req)
 {
-	struct io_link *sl = io_kiocb_to_cmd(req);
+	struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
 
 	putname(sl->oldpath);
 	putname(sl->newpath);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a73f40a4cfe6..25cd724ade18 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -272,7 +272,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 
 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	u64 tmp;
 
 	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
@@ -291,7 +291,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_buffer_list *bl;
 	int ret = 0;
@@ -319,7 +319,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	unsigned long size, tmp_check;
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	u64 tmp;
 
 	if (sqe->rw_flags || sqe->splice_fd_in)
@@ -421,7 +421,7 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
 
 int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_provide_buf *p = io_kiocb_to_cmd(req);
+	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_buffer_list *bl;
 	int ret = 0;
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 753d16734319..976c4ba68ee7 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -26,7 +26,7 @@ struct io_msg {
 static int io_msg_ring_data(struct io_kiocb *req)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 
 	if (msg->src_fd || msg->dst_fd || msg->flags)
 		return -EINVAL;
@@ -76,7 +76,7 @@ static int io_double_lock_ctx(struct io_ring_ctx *ctx,
 static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long file_ptr;
 	struct file *src_file;
@@ -122,7 +122,7 @@ out_unlock:
 
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 
 	if (unlikely(sqe->buf_index || sqe->personality))
 		return -EINVAL;
@@ -141,7 +141,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_msg *msg = io_kiocb_to_cmd(req);
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	int ret;
 
 	ret = -EBADFD;
diff --git a/io_uring/net.c b/io_uring/net.c
index e6fc9748fbd2..6d71748e2c5a 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -77,7 +77,7 @@ struct io_sendzc {
 
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_shutdown *shutdown = io_kiocb_to_cmd(req);
+	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
 
 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
 		     sqe->buf_index || sqe->splice_fd_in))
@@ -89,7 +89,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_shutdown *shutdown = io_kiocb_to_cmd(req);
+	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
 	struct socket *sock;
 	int ret;
 
@@ -174,7 +174,7 @@ static int io_setup_async_msg(struct io_kiocb *req,
 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 			       struct io_async_msghdr *iomsg)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	iomsg->msg.msg_name = &iomsg->addr;
 	iomsg->free_iov = iomsg->fast_iov;
@@ -201,7 +201,7 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
 
 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	if (unlikely(sqe->file_index || sqe->addr2))
 		return -EINVAL;
@@ -225,7 +225,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned flags;
@@ -284,7 +284,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_send(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct msghdr msg;
 	struct iovec iov;
 	struct socket *sock;
@@ -358,7 +358,7 @@ static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 				 struct io_async_msghdr *iomsg)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct user_msghdr msg;
 	int ret;
 
@@ -405,7 +405,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 					struct io_async_msghdr *iomsg)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct compat_msghdr msg;
 	struct compat_iovec __user *uiov;
 	int ret;
@@ -483,7 +483,7 @@ int io_recvmsg_prep_async(struct io_kiocb *req)
 
 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	if (unlikely(sqe->file_index || sqe->addr2))
 		return -EINVAL;
@@ -518,7 +518,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 static inline void io_recv_prep_retry(struct io_kiocb *req)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	sr->done_io = 0;
 	sr->len = 0; /* get from the provided buffer */
@@ -647,7 +647,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
 
 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned int cflags;
@@ -759,7 +759,7 @@ retry_multishot:
 
 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct msghdr msg;
 	struct socket *sock;
 	struct iovec iov;
@@ -850,7 +850,7 @@ out_free:
 
 int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
@@ -946,7 +946,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct sockaddr_storage address;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_sendzc *zc = io_kiocb_to_cmd(req);
+	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
 	struct io_notif_slot *notif_slot;
 	struct io_kiocb *notif;
 	struct msghdr msg;
@@ -1037,7 +1037,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_accept *accept = io_kiocb_to_cmd(req);
+	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
 	unsigned flags;
 
 	if (sqe->len || sqe->buf_index)
@@ -1071,7 +1071,7 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_accept *accept = io_kiocb_to_cmd(req);
+	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 	bool fixed = !!accept->file_slot;
@@ -1129,7 +1129,7 @@ retry:
 
 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_socket *sock = io_kiocb_to_cmd(req);
+	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
 
 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
@@ -1150,7 +1150,7 @@ int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_socket *sock = io_kiocb_to_cmd(req);
+	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
 	bool fixed = !!sock->file_slot;
 	struct file *file;
 	int ret, fd;
@@ -1184,14 +1184,14 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
 int io_connect_prep_async(struct io_kiocb *req)
 {
 	struct io_async_connect *io = req->async_data;
-	struct io_connect *conn = io_kiocb_to_cmd(req);
+	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
 
 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
 }
 
 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_connect *conn = io_kiocb_to_cmd(req);
+	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
 
 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
@@ -1203,7 +1203,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_connect *connect = io_kiocb_to_cmd(req);
+	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
 	struct io_async_connect __io, *io;
 	unsigned file_flags;
 	int ret;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 48d29dead62a..977736e82c1a 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -123,8 +123,6 @@ __cold int io_notif_register(struct io_ring_ctx *ctx,
 	struct io_uring_notification_register reg;
 	unsigned i;
 
-	BUILD_BUG_ON(sizeof(struct io_notif_data) > 64);
-
 	if (ctx->nr_notif_slots)
 		return -EBUSY;
 	if (size != sizeof(reg))
diff --git a/io_uring/notif.h b/io_uring/notif.h
index 0819304d7e00..65f0b42f2555 100644
--- a/io_uring/notif.h
+++ b/io_uring/notif.h
@@ -46,7 +46,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
 
 static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
 {
-	return io_kiocb_to_cmd(notif);
+	return io_kiocb_to_cmd(notif, struct io_notif_data);
 }
 
 static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx,
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index d1818ec9169b..67178e4bb282 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -33,7 +33,7 @@ struct io_close {
 
 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	const char __user *fname;
 	int ret;
 
@@ -66,7 +66,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 
 int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	u64 mode = READ_ONCE(sqe->len);
 	u64 flags = READ_ONCE(sqe->open_flags);
 
@@ -76,7 +76,7 @@ int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	struct open_how __user *how;
 	size_t len;
 	int ret;
@@ -95,7 +95,7 @@ int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 	struct open_flags op;
 	struct file *file;
 	bool resolve_nonblock, nonblock_set;
@@ -167,7 +167,7 @@ int io_openat(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_open_cleanup(struct io_kiocb *req)
 {
-	struct io_open *open = io_kiocb_to_cmd(req);
+	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
 
 	if (open->filename)
 		putname(open->filename);
@@ -187,14 +187,14 @@ int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
 
 static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_close *close = io_kiocb_to_cmd(req);
+	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
 
 	return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1);
 }
 
 int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_close *close = io_kiocb_to_cmd(req);
+	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
 
 	if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
 		return -EINVAL;
@@ -212,7 +212,7 @@ int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 int io_close(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct files_struct *files = current->files;
-	struct io_close *close = io_kiocb_to_cmd(req);
+	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
 	struct fdtable *fdt;
 	struct file *file;
 	int ret = -EBADF;
diff --git a/io_uring/poll.c b/io_uring/poll.c
index dadd293749b0..d5bad0bea6e4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -85,7 +85,7 @@ static struct io_poll *io_poll_get_double(struct io_kiocb *req)
 static struct io_poll *io_poll_get_single(struct io_kiocb *req)
 {
 	if (req->opcode == IORING_OP_POLL_ADD)
-		return io_kiocb_to_cmd(req);
+		return io_kiocb_to_cmd(req, struct io_poll);
 	return &req->apoll->poll;
 }
 
@@ -274,7 +274,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 		return;
 
 	if (ret == IOU_POLL_DONE) {
-		struct io_poll *poll = io_kiocb_to_cmd(req);
+		struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
 		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
 	} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
 		req->cqe.res = ret;
@@ -475,7 +475,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 			       struct poll_table_struct *p)
 {
 	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-	struct io_poll *poll = io_kiocb_to_cmd(pt->req);
+	struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll);
 
 	__io_queue_proc(poll, pt, head,
 			(struct io_poll **) &pt->req->async_data);
@@ -821,7 +821,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
 
 int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_poll_update *upd = io_kiocb_to_cmd(req);
+	struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update);
 	u32 flags;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -851,7 +851,7 @@ int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_poll *poll = io_kiocb_to_cmd(req);
+	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
 	u32 flags;
 
 	if (sqe->buf_index || sqe->off || sqe->addr)
@@ -868,7 +868,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_poll *poll = io_kiocb_to_cmd(req);
+	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
 	struct io_poll_table ipt;
 	int ret;
 
@@ -891,7 +891,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_poll_update *poll_update = io_kiocb_to_cmd(req);
+	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
 	struct io_cancel_data cd = { .data = poll_update->old_user_data, };
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_hash_bucket *bucket;
@@ -930,7 +930,7 @@ found:
 	if (poll_update->update_events || poll_update->update_user_data) {
 		/* only mask one event flags, keep behavior flags */
 		if (poll_update->update_events) {
-			struct io_poll *poll = io_kiocb_to_cmd(preq);
+			struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll);
 
 			poll->events &= ~0xffff;
 			poll->events |= poll_update->events & 0xffff;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..71359a4d0bd4 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -657,7 +657,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 
 int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
@@ -676,7 +676,7 @@ int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static int io_files_update_with_index_alloc(struct io_kiocb *req,
 					    unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	__s32 __user *fds = u64_to_user_ptr(up->arg);
 	unsigned int done;
 	struct file *file;
@@ -714,7 +714,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 
 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_uring_rsrc_update2 up2;
 	int ret;
@@ -743,7 +743,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 
 static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned len = up->nr_args;
 	unsigned idx_end, idx = up->offset;
@@ -778,7 +778,7 @@ out:
 
 int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
+	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 
 	switch (up->type) {
 	case IORING_RSRC_UPDATE_FILES:
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..3d732b19b760 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -35,7 +35,7 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req)
 
 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
 	int ret;
 
@@ -102,7 +102,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 
 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (rw->kiocb.ki_pos != -1)
 		return &rw->kiocb.ki_pos;
@@ -186,7 +186,7 @@ static void kiocb_end_write(struct io_kiocb *req)
 
 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (rw->kiocb.ki_flags & IOCB_WRITE) {
 		kiocb_end_write(req);
@@ -241,7 +241,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 		       unsigned int issue_flags)
 {
 	struct io_async_rw *io = req->async_data;
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	/* add previously done IO, if any */
 	if (req_has_async_data(req) && io->bytes_done > 0) {
@@ -277,7 +277,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 				unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct compat_iovec __user *uiov;
 	compat_ssize_t clen;
 	void __user *buf;
@@ -305,7 +305,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 				      unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct iovec __user *uiov = u64_to_user_ptr(rw->addr);
 	void __user *buf;
 	ssize_t len;
@@ -328,7 +328,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 				    unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
 		iov[0].iov_base = u64_to_user_ptr(rw->addr);
@@ -350,7 +350,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 				       struct io_rw_state *s,
 				       unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct iov_iter *iter = &s->iter;
 	u8 opcode = req->opcode;
 	struct iovec *iovec;
@@ -571,7 +571,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 {
 	struct wait_page_queue *wpq;
 	struct io_kiocb *req = wait->private;
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct wait_page_key *key = arg;
 
 	wpq = container_of(wait, struct wait_page_queue, wait);
@@ -601,7 +601,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
 	struct wait_page_queue *wait = &io->wpq;
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct kiocb *kiocb = &rw->kiocb;
 
 	/* never retry for NOWAIT, we just complete with -EAGAIN */
@@ -649,7 +649,7 @@ static bool need_complete_io(struct io_kiocb *req)
 
 static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct kiocb *kiocb = &rw->kiocb;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file = req->file;
@@ -694,7 +694,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 
 int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_rw_state __s, *s = &__s;
 	struct iovec *iovec;
 	struct kiocb *kiocb = &rw->kiocb;
@@ -839,7 +839,7 @@ done:
 
 int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_rw *rw = io_kiocb_to_cmd(req);
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_rw_state __s, *s = &__s;
 	struct iovec *iovec;
 	struct kiocb *kiocb = &rw->kiocb;
@@ -994,7 +994,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-		struct io_rw *rw = io_kiocb_to_cmd(req);
+		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 		int ret;
 
 		/*
diff --git a/io_uring/splice.c b/io_uring/splice.c
index b013ba34bffa..53e4232d0866 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -26,7 +26,7 @@ struct io_splice {
 static int __io_splice_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 
 	sp->len = READ_ONCE(sqe->len);
@@ -46,7 +46,7 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct file *out = sp->file_out;
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	struct file *in;
@@ -78,7 +78,7 @@ done:
 
 int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 
 	sp->off_in = READ_ONCE(sqe->splice_off_in);
 	sp->off_out = READ_ONCE(sqe->off);
@@ -87,7 +87,7 @@ int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_splice *sp = io_kiocb_to_cmd(req);
+	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct file *out = sp->file_out;
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	loff_t *poff_in, *poff_out;
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 6056cd7f4876..d8fc933d3f59 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -22,7 +22,7 @@ struct io_statx {
 
 int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_statx *sx = io_kiocb_to_cmd(req);
+	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 	const char __user *path;
 
 	if (sqe->buf_index || sqe->splice_fd_in)
@@ -53,7 +53,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_statx *sx = io_kiocb_to_cmd(req);
+	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -66,7 +66,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 
 void io_statx_cleanup(struct io_kiocb *req)
 {
-	struct io_statx *sx = io_kiocb_to_cmd(req);
+	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 
 	if (sx->filename)
 		putname(sx->filename);
diff --git a/io_uring/sync.c b/io_uring/sync.c
index f2102afa79ca..64e87ea2b8fb 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -24,7 +24,7 @@ struct io_sync {
 
 int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 
 	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
@@ -37,7 +37,7 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 	int ret;
 
 	/* sync_file_range always requires a blocking context */
@@ -51,7 +51,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 
 	if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
 		return -EINVAL;
@@ -67,7 +67,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 	loff_t end = sync->off + sync->len;
 	int ret;
 
@@ -83,7 +83,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 
 	if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
@@ -96,7 +96,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_sync *sync = io_kiocb_to_cmd(req);
+	struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync);
 	int ret;
 
 	/* fallocate always requiring blocking context */
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 2f9e56935479..78ea2c64b70e 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -36,7 +36,7 @@ struct io_timeout_rem {
 
 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 
 	return !timeout->off;
 }
@@ -56,7 +56,7 @@ static bool io_kill_timeout(struct io_kiocb *req, int status)
 	struct io_timeout_data *io = req->async_data;
 
 	if (hrtimer_try_to_cancel(&io->timer) != -1) {
-		struct io_timeout *timeout = io_kiocb_to_cmd(req);
+		struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 
 		if (status)
 			req_set_fail(req);
@@ -188,7 +188,7 @@ struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
 	__must_hold(&req->ctx->timeout_lock)
 {
 	struct io_timeout_data *io = link->async_data;
-	struct io_timeout *timeout = io_kiocb_to_cmd(link);
+	struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout);
 
 	io_remove_next_linked(req);
 	timeout->head = NULL;
@@ -205,7 +205,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	struct io_timeout_data *data = container_of(timer,
 						struct io_timeout_data, timer);
 	struct io_kiocb *req = data->req;
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
@@ -252,7 +252,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 	io = req->async_data;
 	if (hrtimer_try_to_cancel(&io->timer) == -1)
 		return ERR_PTR(-EALREADY);
-	timeout = io_kiocb_to_cmd(req);
+	timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	list_del_init(&timeout->list);
 	return req;
 }
@@ -275,7 +275,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
 {
 	unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
 	int ret = -ENOENT;
 
@@ -302,7 +302,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	struct io_timeout_data *data = container_of(timer,
 						struct io_timeout_data, timer);
 	struct io_kiocb *prev, *req = data->req;
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
@@ -378,7 +378,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 {
 	struct io_cancel_data cd = { .data = user_data, };
 	struct io_kiocb *req = io_timeout_extract(ctx, &cd);
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_timeout_data *data;
 
 	if (IS_ERR(req))
@@ -395,7 +395,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 
 int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
+	struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
 
 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 		return -EINVAL;
@@ -435,7 +435,7 @@ static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
  */
 int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
+	struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem);
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
@@ -466,7 +466,7 @@ static int __io_timeout_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe,
 			     bool is_timeout_link)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_timeout_data *data;
 	unsigned flags;
 	u32 off = READ_ONCE(sqe->off);
@@ -532,7 +532,7 @@ int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data = req->async_data;
 	struct list_head *entry;
@@ -583,7 +583,7 @@ add:
 
 void io_queue_linked_timeout(struct io_kiocb *req)
 {
-	struct io_timeout *timeout = io_kiocb_to_cmd(req);
+	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_ring_ctx *ctx = req->ctx;
 
 	spin_lock_irq(&ctx->timeout_lock);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee7036f2241f..478e86a9dfaf 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -11,7 +11,7 @@
 
 static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 
 	ioucmd->task_work_cb(ioucmd);
 }
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
 int io_uring_cmd_prep_async(struct io_kiocb *req)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	size_t cmd_size;
 
 	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
@@ -66,7 +66,7 @@ int io_uring_cmd_prep_async(struct io_kiocb *req)
 
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 
 	if (sqe->rw_flags || sqe->__pad1)
 		return -EINVAL;
@@ -77,7 +77,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file = req->file;
 	int ret;
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index b179f9acd5ac..84180afd090b 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -24,7 +24,7 @@ struct io_xattr {
 
 void io_xattr_cleanup(struct io_kiocb *req)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 
 	if (ix->filename)
 		putname(ix->filename);
@@ -44,7 +44,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret)
 static int __io_getxattr_prep(struct io_kiocb *req,
 			      const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *name;
 	int ret;
 
@@ -85,7 +85,7 @@ int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *path;
 	int ret;
 
@@ -106,7 +106,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	int ret;
 
 	if (issue_flags & IO_URING_F_NONBLOCK)
@@ -122,7 +122,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
 	struct path path;
 	int ret;
@@ -151,7 +151,7 @@ retry:
 static int __io_setxattr_prep(struct io_kiocb *req,
 			const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *name;
 	int ret;
 
@@ -181,7 +181,7 @@ static int __io_setxattr_prep(struct io_kiocb *req,
 
 int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	const char __user *path;
 	int ret;
 
@@ -208,7 +208,7 @@ int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
 			struct path *path)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	int ret;
 
 	ret = mnt_want_write(path->mnt);
@@ -234,7 +234,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_xattr *ix = io_kiocb_to_cmd(req);
+	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	unsigned int lookup_flags = LOOKUP_FOLLOW;
 	struct path path;
 	int ret;
-- 
cgit v1.2.3


From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 13 Aug 2022 08:22:25 -0400
Subject: NFS: Fix another fsync() issue after a server reboot

Currently, when the writeback code detects a server reboot, it redirties
any pages that were not committed to disk, and it sets the flag
NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor
that dirtied the file. While this allows the file descriptor in question
to redrive its own writes, it violates the fsync() requirement that we
should be synchronising all writes to disk.
While the problem is infrequent, we do see corner cases where an
untimely server reboot causes the fsync() call to abandon its attempt to
sync data to disk and causing data corruption issues due to missed error
conditions or similar.

In order to tighted up the client's ability to deal with this situation
without introducing livelocks, add a counter that records the number of
times pages are redirtied due to a server reboot-like condition, and use
that in fsync() to redrive the sync to disk.

Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/file.c          | 15 ++++++---------
 fs/nfs/inode.c         |  1 +
 fs/nfs/write.c         |  6 ++++--
 include/linux/nfs_fs.h |  1 +
 4 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 54237a231687..749e5487df50 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -221,8 +221,10 @@ nfs_file_fsync_commit(struct file *file, int datasync)
 int
 nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = file_inode(file);
+	struct nfs_inode *nfsi = NFS_I(inode);
+	long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+	long nredirtied;
 	int ret;
 
 	trace_nfs_fsync_enter(inode);
@@ -237,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = pnfs_sync_inode(inode, !!datasync);
 		if (ret != 0)
 			break;
-		if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
+		nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+		if (nredirtied == save_nredirtied)
 			break;
-		/*
-		 * If nfs_file_fsync_commit detected a server reboot, then
-		 * resend all dirty pages that might have been covered by
-		 * the NFS_CONTEXT_RESEND_WRITES flag
-		 */
-		start = 0;
-		end = LLONG_MAX;
+		save_nredirtied = nredirtied;
 	}
 
 	trace_nfs_fsync_exit(inode, ret);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b4e46b0ffa2d..bea7c005119c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
 static void nfs_inode_init_regular(struct nfs_inode *nfsi)
 {
 	atomic_long_set(&nfsi->nrequests, 0);
+	atomic_long_set(&nfsi->redirtied_pages, 0);
 	INIT_LIST_HEAD(&nfsi->commit_info.list);
 	atomic_long_set(&nfsi->commit_info.ncommit, 0);
 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4a3796811b4b..989c734cf91d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1420,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
+
 	/* Bump the transmission count */
 	req->wb_nio++;
 	nfs_mark_request_dirty(req);
-	set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+	atomic_long_inc(&nfsi->redirtied_pages);
 	nfs_end_page_writeback(req);
 	nfs_release_request(req);
 }
@@ -1904,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		/* We have a mismatch. Write the page again */
 		dprintk_cont(" mismatch\n");
 		nfs_mark_request_dirty(req);
-		set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+		atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
 	next:
 		nfs_unlock_and_release_request(req);
 		/* Latency breaker */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b32ed68e7dc4..f08e581f0161 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -182,6 +182,7 @@ struct nfs_inode {
 		/* Regular file */
 		struct {
 			atomic_long_t	nrequests;
+			atomic_long_t	redirtied_pages;
 			struct nfs_mds_commit_info commit_info;
 			struct mutex	commit_mutex;
 		};
-- 
cgit v1.2.3


From 5f6277a0c15e1ea54b6fd3d78c9fff7bfe42556c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 13 Aug 2022 08:51:45 -0400
Subject: NFS: Cleanup to remove unused flag NFS_CONTEXT_RESEND_WRITES

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f08e581f0161..7931fa472561 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -83,7 +83,6 @@ struct nfs_open_context {
 	fmode_t mode;
 
 	unsigned long flags;
-#define NFS_CONTEXT_RESEND_WRITES	(1)
 #define NFS_CONTEXT_BAD			(2)
 #define NFS_CONTEXT_UNLOCK	(3)
 #define NFS_CONTEXT_FILE_OPEN		(4)
-- 
cgit v1.2.3


From 9f162193d6e48eb4ff51c2ea3612f1daebca1b7e Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 11 Aug 2022 22:34:25 -0700
Subject: radix-tree: replace gfp.h inclusion with gfp_types.h

Radix tree header includes gfp.h for __GFP_BITS_SHIFT only. Now we
have gfp_types.h for this.

Fixes powerpc allmodconfig build:

   In file included from include/linux/nodemask.h:97,
                    from include/linux/mmzone.h:17,
                    from include/linux/gfp.h:7,
                    from include/linux/radix-tree.h:12,
                    from include/linux/idr.h:15,
                    from include/linux/kernfs.h:12,
                    from include/linux/sysfs.h:16,
                    from include/linux/kobject.h:20,
                    from include/linux/pci.h:35,
                    from arch/powerpc/kernel/prom_init.c:24:
   include/linux/random.h: In function 'add_latent_entropy':
>> include/linux/random.h:25:46: error: 'latent_entropy' undeclared (first use in this function); did you mean 'add_latent_entropy'?
      25 |         add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy));
         |                                              ^~~~~~~~~~~~~~
         |                                              add_latent_entropy
   include/linux/random.h:25:46: note: each undeclared identifier is reported only once for each function it appears in

Reported-by: kernel test robot <lkp@intel.com>
CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f7c1d21c2f39..eae67015ce51 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -9,7 +9,7 @@
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/bitops.h>
-#include <linux/gfp.h>
+#include <linux/gfp_types.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/math.h>
-- 
cgit v1.2.3


From be599244865cb788abe2c6f59ccb05d7240448e4 Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Tue, 9 Aug 2022 19:36:33 +0200
Subject: cpumask: align signatures of UP implementations

Between the generic version, and their uniprocessor optimised
implementations, the return types of cpumask_any_and_distribute() and
cpumask_any_distribute() are not identical.  Change the UP versions to
'unsigned int', to match the generic versions.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0d435d0edbcb..d8c2a40f8beb 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -202,12 +202,13 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
-static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
-					     const struct cpumask *src2p) {
+static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
+						      const struct cpumask *src2p)
+{
 	return cpumask_first_and(src1p, src2p);
 }
 
-static inline int cpumask_any_distribute(const struct cpumask *srcp)
+static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
 {
 	return cpumask_first(srcp);
 }
-- 
cgit v1.2.3


From 2248ccd80124e61c2c84a22b22409bab452e1f0c Mon Sep 17 00:00:00 2001
From: Sander Vanheule <sander@svanheule.net>
Date: Tue, 9 Aug 2022 19:36:34 +0200
Subject: lib/cpumask: add inline cpumask_next_wrap() for UP

In the uniprocessor case, cpumask_next_wrap() can be simplified, as the
number of valid argument combinations is limited:
    - 'start' can only be 0
    - 'n' can only be -1 or 0

The only valid CPU that can then be returned, if any, will be the first
one set in the provided 'mask'.

For NR_CPUS == 1, include/linux/cpumask.h now provides an inline
definition of cpumask_next_wrap(), which will conflict with the one
provided by lib/cpumask.c.  Make building of lib/cpumask.o again depend
on CONFIG_SMP=y (i.e. NR_CPUS > 1) to avoid the re-definition.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sander Vanheule <sander@svanheule.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 19 +++++++++++++++++++
 lib/Makefile            |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d8c2a40f8beb..bd047864c7ac 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -262,7 +262,26 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
+#if NR_CPUS == 1
+static inline
+unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+	cpumask_check(start);
+	if (n != -1)
+		cpumask_check(n);
+
+	/*
+	 * Return the first available CPU when wrapping, or when starting before cpu0,
+	 * since there is only one valid option.
+	 */
+	if (wrap && n >= 0)
+		return nr_cpumask_bits;
+
+	return cpumask_first(mask);
+}
+#else
 unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+#endif
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
diff --git a/lib/Makefile b/lib/Makefile
index c95212141928..5927d7fa0806 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,9 +34,10 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o cpumask.o
+	 buildid.o
 
 lib-$(CONFIG_PRINTK) += dump_stack.o
+lib-$(CONFIG_SMP) += cpumask.o
 
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
-- 
cgit v1.2.3


From 76b079ef4cc954fc2c2e0333a01855b0b2b6bdee Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Sat, 6 Aug 2022 20:05:09 +0800
Subject: sched/psi: Remove unused parameter nbytes of psi_trigger_create()

psi_trigger_create()'s 'nbytes' parameter is not used, so we can remove it.

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/psi.h    | 2 +-
 kernel/cgroup/cgroup.c | 2 +-
 kernel/sched/psi.c     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 89784763d19e..dd74411ac21d 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -27,7 +27,7 @@ void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, size_t nbytes, enum psi_res res);
+			char *buf, enum psi_res res);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ffaccd6373f1..df7df5843b4f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3698,7 +3698,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 	}
 
 	psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
-	new = psi_trigger_create(psi, buf, nbytes, res);
+	new = psi_trigger_create(psi, buf, res);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
 		return PTR_ERR(new);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 5ee615a59fe1..ecb4b4ff4ce0 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1087,7 +1087,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 }
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
-			char *buf, size_t nbytes, enum psi_res res)
+			char *buf, enum psi_res res)
 {
 	struct psi_trigger *t;
 	enum psi_states state;
@@ -1316,7 +1316,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 		return -EBUSY;
 	}
 
-	new = psi_trigger_create(&psi_system, buf, nbytes, res);
+	new = psi_trigger_create(&psi_system, buf, res);
 	if (IS_ERR(new)) {
 		mutex_unlock(&seq->lock);
 		return PTR_ERR(new);
-- 
cgit v1.2.3


From d7ae5818c3fa3007dee13f9d99832e7f26b8bc44 Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Sat, 6 Aug 2022 20:05:10 +0800
Subject: sched/psi: Remove redundant cgroup_psi() when !CONFIG_CGROUPS

cgroup_psi() is only called under CONFIG_CGROUPS.
We don't need cgroup_psi() when !CONFIG_CGROUPS,
so we can remove it in this case.

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed53bfe7c46c..ac5d0515680e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -734,11 +734,6 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
 	return NULL;
 }
 
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-	return NULL;
-}
-
 static inline bool cgroup_psi_enabled(void)
 {
 	return false;
-- 
cgit v1.2.3


From 484b9fa4886bd9377969aad5e9ea17efda4ecda6 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 16 Aug 2022 01:36:31 -0400
Subject: virtio: Revert "virtio: add helper virtio_find_vqs_ctx_size()"

This reverts commit fe3dc04e31aa51f91dc7f741a5f76cc4817eb5b4: the
API is now unused and in fact can't be implemented on top of a legacy
device.

Fixes: fe3dc04e31aa ("virtio: add helper virtio_find_vqs_ctx_size()")
Cc: "Xuan Zhuo" <xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20220816053602.173815-3-mst@redhat.com>
---
 include/linux/virtio_config.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 6adff09f7170..888f7e96f0c7 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -241,18 +241,6 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 				      ctx, desc);
 }
 
-static inline
-int virtio_find_vqs_ctx_size(struct virtio_device *vdev, u32 nvqs,
-			     struct virtqueue *vqs[],
-			     vq_callback_t *callbacks[],
-			     const char * const names[],
-			     u32 sizes[],
-			     const bool *ctx, struct irq_affinity *desc)
-{
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, sizes,
-				      ctx, desc);
-}
-
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
  * @vdev: the device
-- 
cgit v1.2.3


From 9993a4f989c7ca5e227329b2878f65d05c9fc20f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 16 Aug 2022 01:36:58 -0400
Subject: virtio: Revert "virtio: find_vqs() add arg sizes"

This reverts commit a10fba0377145fccefea4dc4dd5915b7ed87e546: the
proposed API isn't supported on all transports but no
effort was made to address this.

It might not be hard to fix if we want to: maybe just
rename size to size_hint and make sure legacy
transports ignore the hint.

But it's not sure what the benefit is in any case, so
let's drop it.

Fixes: a10fba037714 ("virtio: find_vqs() add arg sizes")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20220816053602.173815-8-mst@redhat.com>
---
 arch/um/drivers/virtio_uml.c             |  2 +-
 drivers/platform/mellanox/mlxbf-tmfifo.c |  1 -
 drivers/remoteproc/remoteproc_virtio.c   |  1 -
 drivers/s390/virtio/virtio_ccw.c         |  1 -
 drivers/virtio/virtio_mmio.c             |  1 -
 drivers/virtio/virtio_pci_common.c       |  2 +-
 drivers/virtio/virtio_pci_common.h       |  2 +-
 drivers/virtio/virtio_pci_modern.c       |  7 ++-----
 drivers/virtio/virtio_vdpa.c             |  1 -
 include/linux/virtio_config.h            | 14 +++++---------
 10 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 79e38afd4b91..e719af8bdf56 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -1011,7 +1011,7 @@ error_kzalloc:
 
 static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 		       struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		       const char * const names[], u32 sizes[], const bool *ctx,
+		       const char * const names[], const bool *ctx,
 		       struct irq_affinity *desc)
 {
 	struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev);
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 8be13d416f48..1ae3c56b66b0 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -928,7 +928,6 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 					struct virtqueue *vqs[],
 					vq_callback_t *callbacks[],
 					const char * const names[],
-					u32 sizes[],
 					const bool *ctx,
 					struct irq_affinity *desc)
 {
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 81c4f5776109..0f7706e23eb9 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -158,7 +158,6 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				 struct virtqueue *vqs[],
 				 vq_callback_t *callbacks[],
 				 const char * const names[],
-				 u32 sizes[],
 				 const bool * ctx,
 				 struct irq_affinity *desc)
 {
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 896896e32664..a10dbe632ef9 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -637,7 +637,6 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			       struct virtqueue *vqs[],
 			       vq_callback_t *callbacks[],
 			       const char * const names[],
-			       u32 sizes[],
 			       const bool *ctx,
 			       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index dfcecfd7aba1..3ff746e3f24a 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -474,7 +474,6 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		       struct virtqueue *vqs[],
 		       vq_callback_t *callbacks[],
 		       const char * const names[],
-		       u32 sizes[],
 		       const bool *ctx,
 		       struct irq_affinity *desc)
 {
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 7ad734584823..ad258a9d3b9f 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -396,7 +396,7 @@ out_del_vqs:
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], u32 sizes[], const bool *ctx,
+		const char * const names[], const bool *ctx,
 		struct irq_affinity *desc)
 {
 	int err;
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index a5ff838b85a5..23112d84218f 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -110,7 +110,7 @@ void vp_del_vqs(struct virtio_device *vdev);
 /* the config->find_vqs() implementation */
 int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 		struct virtqueue *vqs[], vq_callback_t *callbacks[],
-		const char * const names[], u32 sizes[], const bool *ctx,
+		const char * const names[], const bool *ctx,
 		struct irq_affinity *desc);
 const char *vp_bus_name(struct virtio_device *vdev);
 
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index be51ec849252..c3b9f2761849 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -347,15 +347,12 @@ err:
 static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 			      struct virtqueue *vqs[],
 			      vq_callback_t *callbacks[],
-			      const char * const names[],
-			      u32 sizes[],
-			      const bool *ctx,
+			      const char * const names[], const bool *ctx,
 			      struct irq_affinity *desc)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	struct virtqueue *vq;
-	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, ctx,
-			     desc);
+	int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc);
 
 	if (rc)
 		return rc;
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 832d2c5b1b19..9670cc79371d 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -269,7 +269,6 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				struct virtqueue *vqs[],
 				vq_callback_t *callbacks[],
 				const char * const names[],
-				u32 sizes[],
 				const bool *ctx,
 				struct irq_affinity *desc)
 {
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 888f7e96f0c7..36ec7be1f480 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -55,7 +55,6 @@ struct virtio_shm_region {
  *		include a NULL entry for vqs that do not need a callback
  *	names: array of virtqueue names (mainly for debugging)
  *		include a NULL entry for vqs unused by driver
- *	sizes: array of virtqueue sizes
  *	Returns 0 on success or error status
  * @del_vqs: free virtqueues found by find_vqs().
  * @synchronize_cbs: synchronize with the virtqueue callbacks (optional)
@@ -104,9 +103,7 @@ struct virtio_config_ops {
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
 			struct virtqueue *vqs[], vq_callback_t *callbacks[],
-			const char * const names[],
-			u32 sizes[],
-			const bool *ctx,
+			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	void (*synchronize_cbs)(struct virtio_device *);
@@ -215,7 +212,7 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	const char *names[] = { n };
 	struct virtqueue *vq;
 	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL,
-					 NULL, NULL);
+					 NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
@@ -227,8 +224,7 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[],
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
-				      NULL, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc);
 }
 
 static inline
@@ -237,8 +233,8 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 			const char * const names[], const bool *ctx,
 			struct irq_affinity *desc)
 {
-	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL,
-				      ctx, desc);
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx,
+				      desc);
 }
 
 /**
-- 
cgit v1.2.3


From 5c669c4a4c6aa0489848093c93b8029f5c5c75ec Mon Sep 17 00:00:00 2001
From: Ricardo Cañuelo <ricardo.canuelo@collabora.com>
Date: Wed, 10 Aug 2022 11:40:03 +0200
Subject: virtio: kerneldocs fixes and enhancements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix variable names in some kerneldocs, naming in others.
Add kerneldocs for struct vring_desc and vring_interrupt.

Signed-off-by: Ricardo Cañuelo <ricardo.canuelo@collabora.com>
Message-Id: <20220810094004.1250-2-ricardo.canuelo@collabora.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 drivers/virtio/virtio_ring.c     |  8 ++++++++
 include/linux/virtio.h           |  6 +++---
 include/linux/virtio_config.h    |  6 +++---
 include/uapi/linux/virtio_ring.h | 16 +++++++++++-----
 4 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d66c8e6d0ef3..4620e9d79dde 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2426,6 +2426,14 @@ static inline bool more_used(const struct vring_virtqueue *vq)
 	return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
 }
 
+/**
+ * vring_interrupt - notify a virtqueue on an interrupt
+ * @irq: the IRQ number (ignored)
+ * @_vq: the struct virtqueue to notify
+ *
+ * Calls the callback function of @_vq to process the virtqueue
+ * notification.
+ */
 irqreturn_t vring_interrupt(int irq, void *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index a3f73bb6733e..dcab9c7e8784 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -11,7 +11,7 @@
 #include <linux/gfp.h>
 
 /**
- * virtqueue - a queue to register buffers for sending or receiving.
+ * struct virtqueue - a queue to register buffers for sending or receiving.
  * @list: the chain of virtqueues for this device
  * @callback: the function to call when buffers are consumed (can be NULL).
  * @name: the name of this virtqueue (mainly for debugging)
@@ -97,7 +97,7 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
 		     void (*recycle)(struct virtqueue *vq, void *buf));
 
 /**
- * virtio_device - representation of a device using virtio
+ * struct virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
  * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
  * @config_enabled: configuration change reporting enabled
@@ -156,7 +156,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev);
 	list_for_each_entry(vq, &vdev->vqs, list)
 
 /**
- * virtio_driver - operations for a virtio I/O driver
+ * struct virtio_driver - operations for a virtio I/O driver
  * @driver: underlying device driver (populate name and owner).
  * @id_table: the ids serviced by this driver.
  * @feature_table: an array of feature numbers supported by this driver.
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 36ec7be1f480..4b517649cfe8 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -239,7 +239,7 @@ int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
 
 /**
  * virtio_synchronize_cbs - synchronize with virtqueue callbacks
- * @vdev: the device
+ * @dev: the virtio device
  */
 static inline
 void virtio_synchronize_cbs(struct virtio_device *dev)
@@ -258,7 +258,7 @@ void virtio_synchronize_cbs(struct virtio_device *dev)
 
 /**
  * virtio_device_ready - enable vq use in probe function
- * @vdev: the device
+ * @dev: the virtio device
  *
  * Driver must call this to use vqs in the probe function.
  *
@@ -306,7 +306,7 @@ const char *virtio_bus_name(struct virtio_device *vdev)
 /**
  * virtqueue_set_affinity - setting affinity for a virtqueue
  * @vq: the virtqueue
- * @cpu: the cpu no.
+ * @cpu_mask: the cpu mask
  *
  * Pay attention the function are best-effort: the affinity hint may not be set
  * due to config support, irq type and sharing.
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 476d3e5c0fe7..f8c20d3de8da 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -93,15 +93,21 @@
 #define VRING_USED_ALIGN_SIZE 4
 #define VRING_DESC_ALIGN_SIZE 16
 
-/* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
+/**
+ * struct vring_desc - Virtio ring descriptors,
+ * 16 bytes long. These can chain together via @next.
+ *
+ * @addr: buffer address (guest-physical)
+ * @len: buffer length
+ * @flags: descriptor flags
+ * @next: index of the next descriptor in the chain,
+ *        if the VRING_DESC_F_NEXT flag is set. We chain unused
+ *        descriptors via this, too.
+ */
 struct vring_desc {
-	/* Address (guest-physical). */
 	__virtio64 addr;
-	/* Length. */
 	__virtio32 len;
-	/* The flags as indicated above. */
 	__virtio16 flags;
-	/* We chain unused descriptors via this, too */
 	__virtio16 next;
 };
 
-- 
cgit v1.2.3


From a8239f0342bae5a51acca967ba95b9a8ad56dd62 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 18 Aug 2022 14:35:55 +0800
Subject: blk-mq: remove unused function blk_mq_queue_stopped()

blk_mq_queue_stopped() doesn't have any caller, which was found by
code coverage test, thus remove it.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20220818063555.3741222-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 20 --------------------
 include/linux/blk-mq.h |  1 -
 2 files changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5ee62b95f3e5..5568c7d09114 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2229,26 +2229,6 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 }
 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
 
-/**
- * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
- * @q: request queue.
- *
- * The caller is responsible for serializing this function against
- * blk_mq_{start,stop}_hw_queue().
- */
-bool blk_mq_queue_stopped(struct request_queue *q)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		if (blk_mq_hctx_stopped(hctx))
-			return true;
-
-	return false;
-}
-EXPORT_SYMBOL(blk_mq_queue_stopped);
-
 /*
  * This function is often used for pausing .queue_rq() by driver when
  * there isn't enough resource or some conditions aren't satisfied, and
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index effee1dc715a..92294a5fb083 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -857,7 +857,6 @@ void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_complete_request(struct request *rq);
 bool blk_mq_complete_request_remote(struct request *rq);
-bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
-- 
cgit v1.2.3


From bdd1c37a315bc50ab14066c4852bc8dcf070451e Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Tue, 16 Aug 2022 20:53:21 +0800
Subject: KVM: Rename KVM_PRIVATE_MEM_SLOTS to KVM_INTERNAL_MEM_SLOTS

KVM_INTERNAL_MEM_SLOTS better reflects the fact those slots are KVM
internally used (invisible to userspace) and avoids confusion to future
private slots that can have different meaning.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Message-Id: <20220816125322.1110439-2-chao.p.peng@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 include/linux/kvm_host.h        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ffa578cafe1..2c96c43c313a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -53,7 +53,7 @@
 #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
 
 /* memory slots that are not exposed to userspace */
-#define KVM_PRIVATE_MEM_SLOTS 3
+#define KVM_INTERNAL_MEM_SLOTS 3
 
 #define KVM_HALT_POLL_NS_DEFAULT 200000
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c480b1821e1..fd5c3b4715f8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -656,12 +656,12 @@ struct kvm_irq_routing_table {
 };
 #endif
 
-#ifndef KVM_PRIVATE_MEM_SLOTS
-#define KVM_PRIVATE_MEM_SLOTS 0
+#ifndef KVM_INTERNAL_MEM_SLOTS
+#define KVM_INTERNAL_MEM_SLOTS 0
 #endif
 
 #define KVM_MEM_SLOTS_NUM SHRT_MAX
-#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_PRIVATE_MEM_SLOTS)
+#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)
 
 #ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 20ec3ebd707c77fb9b11b37193449193d4649f33 Mon Sep 17 00:00:00 2001
From: Chao Peng <chao.p.peng@linux.intel.com>
Date: Tue, 16 Aug 2022 20:53:22 +0800
Subject: KVM: Rename mmu_notifier_* to mmu_invalidate_*

The motivation of this renaming is to make these variables and related
helper functions less mmu_notifier bound and can also be used for non
mmu_notifier based page invalidation. mmu_invalidate_* was chosen to
better describe the purpose of 'invalidating' a page that those
variables are used for.

  - mmu_notifier_seq/range_start/range_end are renamed to
    mmu_invalidate_seq/range_start/range_end.

  - mmu_notifier_retry{_hva} helper functions are renamed to
    mmu_invalidate_retry{_hva}.

  - mmu_notifier_count is renamed to mmu_invalidate_in_progress to
    avoid confusion with mn_active_invalidate_count.

  - While here, also update kvm_inc/dec_notifier_count() to
    kvm_mmu_invalidate_begin/end() to match the change for
    mmu_notifier_count.

No functional change intended.

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Message-Id: <20220816125322.1110439-3-chao.p.peng@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/mmu.c                     |  8 ++---
 arch/mips/kvm/mmu.c                      | 12 +++----
 arch/powerpc/include/asm/kvm_book3s_64.h |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_host.c    |  4 +--
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  4 +--
 arch/powerpc/kvm/book3s_64_mmu_radix.c   |  6 ++--
 arch/powerpc/kvm/book3s_hv_nested.c      |  2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  8 ++---
 arch/powerpc/kvm/e500_mmu_host.c         |  4 +--
 arch/riscv/kvm/mmu.c                     |  4 +--
 arch/x86/kvm/mmu/mmu.c                   | 14 ++++----
 arch/x86/kvm/mmu/paging_tmpl.h           |  4 +--
 include/linux/kvm_host.h                 | 60 +++++++++++++++++---------------
 virt/kvm/kvm_main.c                      | 52 ++++++++++++++-------------
 virt/kvm/pfncache.c                      | 17 ++++-----
 15 files changed, 103 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 87f1cd0df36e..c9a13e487187 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -993,7 +993,7 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		 * THP doesn't start to split while we are adjusting the
 		 * refcounts.
 		 *
-		 * We are sure this doesn't happen, because mmu_notifier_retry
+		 * We are sure this doesn't happen, because mmu_invalidate_retry
 		 * was successful and we are holding the mmu_lock, so if this
 		 * THP is trying to split, it will be blocked in the mmu
 		 * notifier before touching any of the pages, specifically
@@ -1188,9 +1188,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			return ret;
 	}
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	/*
-	 * Ensure the read of mmu_notifier_seq happens before we call
+	 * Ensure the read of mmu_invalidate_seq happens before we call
 	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
 	 * the page we just got a reference to gets unmapped before we have a
 	 * chance to grab the mmu_lock, which ensure that if the page gets
@@ -1246,7 +1246,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	else
 		write_lock(&kvm->mmu_lock);
 	pgt = vcpu->arch.hw_mmu->pgt;
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	/*
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index db17e870bdff..74cd64a24d05 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -615,17 +615,17 @@ retry:
 	 * Used to check for invalidations in progress, of the pfn that is
 	 * returned by pfn_to_pfn_prot below.
 	 */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	/*
-	 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
-	 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
+	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads
+	 * in gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
 	 * risk the page we get a reference to getting unmapped before we have a
-	 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
+	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
 	 *
 	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
 	 * of the pte_unmap_unlock() after the PTE is zapped, and the
 	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
-	 * mmu_notifier_seq is incremented.
+	 * mmu_invalidate_seq is incremented.
 	 */
 	smp_rmb();
 
@@ -638,7 +638,7 @@ retry:
 
 	spin_lock(&kvm->mmu_lock);
 	/* Check if an invalidation has taken place since we got pfn */
-	if (mmu_notifier_retry(kvm, mmu_seq)) {
+	if (mmu_invalidate_retry(kvm, mmu_seq)) {
 		/*
 		 * This can happen when mappings are changed asynchronously, but
 		 * also synchronously if a COW is triggered by
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4def2bd17b9b..d49065af08e9 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -666,7 +666,7 @@ static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq,
 	VM_WARN(!spin_is_locked(&kvm->mmu_lock),
 		"%s called with kvm mmu_lock not held \n", __func__);
 
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		return NULL;
 
 	pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 1ae09992c9ea..bc6a381b5346 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -90,7 +90,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
 	unsigned long pfn;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/* Get host physical address for gpa */
@@ -151,7 +151,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
 	cpte = kvmppc_mmu_hpte_cache_next(vcpu);
 
 	spin_lock(&kvm->mmu_lock);
-	if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) {
+	if (!cpte || mmu_invalidate_retry(kvm, mmu_seq)) {
 		r = -EAGAIN;
 		goto out_unlock;
 	}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 514fd45c1994..e9744b41a226 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -578,7 +578,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 		return -EFAULT;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	ret = -EFAULT;
@@ -693,7 +693,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 
 	/* Check if we might have been invalidated; let the guest retry if so */
 	ret = RESUME_GUEST;
-	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
+	if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
 		unlock_rmap(rmap);
 		goto out_unlock;
 	}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 9d4b3feda3b6..5d5e12f3bf86 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -640,7 +640,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 	/* Check if we might have been invalidated; let the guest retry if so */
 	spin_lock(&kvm->mmu_lock);
 	ret = -EAGAIN;
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	/* Now traverse again under the lock and change the tree */
@@ -830,7 +830,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 	bool large_enable;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/*
@@ -1191,7 +1191,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
 	 * Increase the mmu notifier sequence number to prevent any page
 	 * fault that read the memslot earlier from writing a PTE.
 	 */
-	kvm->mmu_notifier_seq++;
+	kvm->mmu_invalidate_seq++;
 	spin_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index be8249cc6107..5a64a1341e6f 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1580,7 +1580,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
 	/* 2. Find the host pte for this L1 guest real address */
 
 	/* Used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/* See if can find translation in our partition scoped tables for L1 */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 2257fb18cb72..5a05953ae13f 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -219,7 +219,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	g_ptel = ptel;
 
 	/* used later to detect if we might have been invalidated */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/* Find the memslot (if any) for this address */
@@ -366,7 +366,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 			rmap = real_vmalloc_addr(rmap);
 		lock_rmap(rmap);
 		/* Check for pending invalidations under the rmap chain lock */
-		if (mmu_notifier_retry(kvm, mmu_seq)) {
+		if (mmu_invalidate_retry(kvm, mmu_seq)) {
 			/* inval in progress, write a non-present HPTE */
 			pteh |= HPTE_V_ABSENT;
 			pteh &= ~HPTE_V_VALID;
@@ -932,7 +932,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
 	int i;
 
 	/* Used later to detect if we might have been invalidated */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
@@ -960,7 +960,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
 	long ret = H_SUCCESS;
 
 	/* Used later to detect if we might have been invalidated */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 7f16afc331ef..05668e964140 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -339,7 +339,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	unsigned long flags;
 
 	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	/*
@@ -460,7 +460,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 
 	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(kvm, mmu_seq)) {
+	if (mmu_invalidate_retry(kvm, mmu_seq)) {
 		ret = -EAGAIN;
 		goto out;
 	}
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 3a35b2d95697..3620ecac2fa1 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -666,7 +666,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
 		return ret;
 	}
 
-	mmu_seq = kvm->mmu_notifier_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 
 	hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable);
 	if (hfn == KVM_PFN_ERR_HWPOISON) {
@@ -686,7 +686,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
 
 	spin_lock(&kvm->mmu_lock);
 
-	if (mmu_notifier_retry(kvm, mmu_seq))
+	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
 
 	if (writable) {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index eccddb136954..126fa9aec64c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2914,7 +2914,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	 * If addresses are being invalidated, skip prefetching to avoid
 	 * accidentally prefetching those addresses.
 	 */
-	if (unlikely(vcpu->kvm->mmu_notifier_count))
+	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
 		return;
 
 	__direct_pte_prefetch(vcpu, sp, sptep);
@@ -2928,7 +2928,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * There are several ways to safely use this helper:
  *
- * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
  *   lookup, but it does need to be held while checking the MMU notifier.
  *
@@ -3056,7 +3056,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		return;
 
 	/*
-	 * mmu_notifier_retry() was successful and mmu_lock is held, so
+	 * mmu_invalidate_retry() was successful and mmu_lock is held, so
 	 * the pmd can't be split from under us.
 	 */
 	fault->goal_level = fault->req_level;
@@ -4203,7 +4203,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 		return true;
 
 	return fault->slot &&
-	       mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+	       mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
 }
 
 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -4227,7 +4227,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r)
 		return r;
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	r = kvm_faultin_pfn(vcpu, fault);
@@ -6055,7 +6055,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 
 	write_lock(&kvm->mmu_lock);
 
-	kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+	kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end);
 
 	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
 
@@ -6069,7 +6069,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
 						   gfn_end - gfn_start);
 
-	kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+	kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end);
 
 	write_unlock(&kvm->mmu_lock);
 }
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f5958071220c..39e0205e7300 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -589,7 +589,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 	 * If addresses are being invalidated, skip prefetching to avoid
 	 * accidentally prefetching those addresses.
 	 */
-	if (unlikely(vcpu->kvm->mmu_notifier_count))
+	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
 		return;
 
 	if (sp->role.direct)
@@ -838,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	else
 		fault->max_level = walker.level;
 
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
 	r = kvm_faultin_pfn(vcpu, fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fd5c3b4715f8..f4519d3689e1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -765,10 +765,10 @@ struct kvm {
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	struct mmu_notifier mmu_notifier;
-	unsigned long mmu_notifier_seq;
-	long mmu_notifier_count;
-	unsigned long mmu_notifier_range_start;
-	unsigned long mmu_notifier_range_end;
+	unsigned long mmu_invalidate_seq;
+	long mmu_invalidate_in_progress;
+	unsigned long mmu_invalidate_range_start;
+	unsigned long mmu_invalidate_range_end;
 #endif
 	struct list_head devices;
 	u64 manual_dirty_log_protect;
@@ -1357,10 +1357,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 #endif
 
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end);
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end);
+void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
+			      unsigned long end);
+void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
+			    unsigned long end);
 
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
@@ -1907,42 +1907,44 @@ extern const struct kvm_stats_header kvm_vcpu_stats_header;
 extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
-static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
+static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
 {
-	if (unlikely(kvm->mmu_notifier_count))
+	if (unlikely(kvm->mmu_invalidate_in_progress))
 		return 1;
 	/*
-	 * Ensure the read of mmu_notifier_count happens before the read
-	 * of mmu_notifier_seq.  This interacts with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end to make sure that the caller
-	 * either sees the old (non-zero) value of mmu_notifier_count or
-	 * the new (incremented) value of mmu_notifier_seq.
-	 * PowerPC Book3s HV KVM calls this under a per-page lock
-	 * rather than under kvm->mmu_lock, for scalability, so
-	 * can't rely on kvm->mmu_lock to keep things ordered.
+	 * Ensure the read of mmu_invalidate_in_progress happens before
+	 * the read of mmu_invalidate_seq.  This interacts with the
+	 * smp_wmb() in mmu_notifier_invalidate_range_end to make sure
+	 * that the caller either sees the old (non-zero) value of
+	 * mmu_invalidate_in_progress or the new (incremented) value of
+	 * mmu_invalidate_seq.
+	 *
+	 * PowerPC Book3s HV KVM calls this under a per-page lock rather
+	 * than under kvm->mmu_lock, for scalability, so can't rely on
+	 * kvm->mmu_lock to keep things ordered.
 	 */
 	smp_rmb();
-	if (kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
 }
 
-static inline int mmu_notifier_retry_hva(struct kvm *kvm,
-					 unsigned long mmu_seq,
-					 unsigned long hva)
+static inline int mmu_invalidate_retry_hva(struct kvm *kvm,
+					   unsigned long mmu_seq,
+					   unsigned long hva)
 {
 	lockdep_assert_held(&kvm->mmu_lock);
 	/*
-	 * If mmu_notifier_count is non-zero, then the range maintained by
-	 * kvm_mmu_notifier_invalidate_range_start contains all addresses that
-	 * might be being invalidated. Note that it may include some false
+	 * If mmu_invalidate_in_progress is non-zero, then the range maintained
+	 * by kvm_mmu_notifier_invalidate_range_start contains all addresses
+	 * that might be being invalidated. Note that it may include some false
 	 * positives, due to shortcuts when handing concurrent invalidations.
 	 */
-	if (unlikely(kvm->mmu_notifier_count) &&
-	    hva >= kvm->mmu_notifier_range_start &&
-	    hva < kvm->mmu_notifier_range_end)
+	if (unlikely(kvm->mmu_invalidate_in_progress) &&
+	    hva >= kvm->mmu_invalidate_range_start &&
+	    hva < kvm->mmu_invalidate_range_end)
 		return 1;
-	if (kvm->mmu_notifier_seq != mmu_seq)
+	if (kvm->mmu_invalidate_seq != mmu_seq)
 		return 1;
 	return 0;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 44b92d773156..5142c054b03c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -702,30 +702,31 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 
 	/*
 	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
-	 * If mmu_notifier_count is zero, then no in-progress invalidations,
-	 * including this one, found a relevant memslot at start(); rechecking
-	 * memslots here is unnecessary.  Note, a false positive (count elevated
-	 * by a different invalidation) is sub-optimal but functionally ok.
+	 * If mmu_invalidate_in_progress is zero, then no in-progress
+	 * invalidations, including this one, found a relevant memslot at
+	 * start(); rechecking memslots here is unnecessary.  Note, a false
+	 * positive (count elevated by a different invalidation) is sub-optimal
+	 * but functionally ok.
 	 */
 	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
-	if (!READ_ONCE(kvm->mmu_notifier_count))
+	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 		return;
 
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end)
+void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
+			      unsigned long end)
 {
 	/*
 	 * The count increase must become visible at unlock time as no
 	 * spte can be established without taking the mmu_lock and
 	 * count is also read inside the mmu_lock critical section.
 	 */
-	kvm->mmu_notifier_count++;
-	if (likely(kvm->mmu_notifier_count == 1)) {
-		kvm->mmu_notifier_range_start = start;
-		kvm->mmu_notifier_range_end = end;
+	kvm->mmu_invalidate_in_progress++;
+	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+		kvm->mmu_invalidate_range_start = start;
+		kvm->mmu_invalidate_range_end = end;
 	} else {
 		/*
 		 * Fully tracking multiple concurrent ranges has diminishing
@@ -736,10 +737,10 @@ void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 		 * accumulate and persist until all outstanding invalidates
 		 * complete.
 		 */
-		kvm->mmu_notifier_range_start =
-			min(kvm->mmu_notifier_range_start, start);
-		kvm->mmu_notifier_range_end =
-			max(kvm->mmu_notifier_range_end, end);
+		kvm->mmu_invalidate_range_start =
+			min(kvm->mmu_invalidate_range_start, start);
+		kvm->mmu_invalidate_range_end =
+			max(kvm->mmu_invalidate_range_end, end);
 	}
 }
 
@@ -752,7 +753,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.end		= range->end,
 		.pte		= __pte(0),
 		.handler	= kvm_unmap_gfn_range,
-		.on_lock	= kvm_inc_notifier_count,
+		.on_lock	= kvm_mmu_invalidate_begin,
 		.on_unlock	= kvm_arch_guest_memory_reclaimed,
 		.flush_on_ret	= true,
 		.may_block	= mmu_notifier_range_blockable(range),
@@ -763,7 +764,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	/*
 	 * Prevent memslot modification between range_start() and range_end()
 	 * so that conditionally locking provides the same result in both
-	 * functions.  Without that guarantee, the mmu_notifier_count
+	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
 	 * adjustments will be imbalanced.
 	 *
 	 * Pairs with the decrement in range_end().
@@ -779,7 +780,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * any given time, and the caches themselves can check for hva overlap,
 	 * i.e. don't need to rely on memslot overlap checks for performance.
 	 * Because this runs without holding mmu_lock, the pfn caches must use
-	 * mn_active_invalidate_count (see above) instead of mmu_notifier_count.
+	 * mn_active_invalidate_count (see above) instead of
+	 * mmu_invalidate_in_progress.
 	 */
 	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 					  hva_range.may_block);
@@ -789,22 +791,22 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	return 0;
 }
 
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
-				   unsigned long end)
+void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start,
+			    unsigned long end)
 {
 	/*
 	 * This sequence increase will notify the kvm page fault that
 	 * the page that is going to be mapped in the spte could have
 	 * been freed.
 	 */
-	kvm->mmu_notifier_seq++;
+	kvm->mmu_invalidate_seq++;
 	smp_wmb();
 	/*
 	 * The above sequence increase must be visible before the
 	 * below count decrease, which is ensured by the smp_wmb above
-	 * in conjunction with the smp_rmb in mmu_notifier_retry().
+	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
 	 */
-	kvm->mmu_notifier_count--;
+	kvm->mmu_invalidate_in_progress--;
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -816,7 +818,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 		.end		= range->end,
 		.pte		= __pte(0),
 		.handler	= (void *)kvm_null_fn,
-		.on_lock	= kvm_dec_notifier_count,
+		.on_lock	= kvm_mmu_invalidate_end,
 		.on_unlock	= (void *)kvm_null_fn,
 		.flush_on_ret	= false,
 		.may_block	= mmu_notifier_range_blockable(range),
@@ -837,7 +839,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	if (wake)
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 
-	BUG_ON(kvm->mmu_notifier_count < 0);
+	BUG_ON(kvm->mmu_invalidate_in_progress < 0);
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index ab519f72f2cd..68ff41d39545 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -112,27 +112,28 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
 {
 	/*
 	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_notifier_count here; but the latter cannot be used
-	 * here because the invalidation of caches in the mmu_notifier
-	 * event occurs _before_ mmu_notifier_count is elevated.
+	 * like mmu_invalidate_in_progress here; but the latter cannot
+	 * be used here because the invalidation of caches in the
+	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
+	 * is elevated.
 	 *
 	 * Note, it does not matter that mn_active_invalidate_count
 	 * is not protected by gpc->lock.  It is guaranteed to
 	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_notifier_seq is updated.
+	 * isn't dropped until after mmu_invalidate_seq is updated.
 	 */
 	if (kvm->mn_active_invalidate_count)
 		return true;
 
 	/*
 	 * Ensure mn_active_invalidate_count is read before
-	 * mmu_notifier_seq.  This pairs with the smp_wmb() in
+	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
 	 * mmu_notifier_invalidate_range_end() to guarantee either the
 	 * old (non-zero) value of mn_active_invalidate_count or the
-	 * new (incremented) value of mmu_notifier_seq is observed.
+	 * new (incremented) value of mmu_invalidate_seq is observed.
 	 */
 	smp_rmb();
-	return kvm->mmu_notifier_seq != mmu_seq;
+	return kvm->mmu_invalidate_seq != mmu_seq;
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
@@ -155,7 +156,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	do {
-		mmu_seq = kvm->mmu_notifier_seq;
+		mmu_seq = kvm->mmu_invalidate_seq;
 		smp_rmb();
 
 		write_unlock_irq(&gpc->lock);
-- 
cgit v1.2.3


From a357f7b4583ebf81d19c95aef57497ae81c5f63c Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Wed, 17 Aug 2022 23:20:08 +0800
Subject: ata: libata: Set __ATA_BASE_SHT max_sectors

Commit 0568e6122574 ("ata: libata-scsi: cap ata_device->max_sectors
according to shost->max_sectors") inadvertently capped the max_sectors
value for some SATA disks to a value which is lower than we would want.

For a device which supports LBA48, we would previously have request queue
max_sectors_kb and max_hw_sectors_kb values of 1280 and 32767 respectively.

For AHCI controllers, the value chosen for shost max sectors comes from
the minimum of the SCSI host default max sectors in
SCSI_DEFAULT_MAX_SECTORS (1024) and the shost DMA device mapping limit.

This means that we would now set the max_sectors_kb and max_hw_sectors_kb
values for a disk which supports LBA48 at 512, ignoring DMA mapping limit.

As report by Oliver at [0], this caused a performance regression.

Fix by picking a large enough max sectors value for ATA host controllers
such that we don't needlessly reduce max_sectors_kb for LBA48 disks.

[0] https://lore.kernel.org/linux-ide/YvsGbidf3na5FpGb@xsang-OptiPlex-9020/T/#m22d9fc5ad15af66066dd9fecf3d50f1b1ef11da3

Fixes: 0568e6122574 ("ata: libata-scsi: cap ata_device->max_sectors according to shost->max_sectors")
Reported-by: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
---
 include/linux/libata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0269ff114f5a..698032e5ef2d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1382,7 +1382,8 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	.proc_name		= drv_name,			\
 	.slave_destroy		= ata_scsi_slave_destroy,	\
 	.bios_param		= ata_std_bios_param,		\
-	.unlock_native_capacity	= ata_scsi_unlock_native_capacity
+	.unlock_native_capacity	= ata_scsi_unlock_native_capacity,\
+	.max_sectors		= ATA_MAX_SECTORS_LBA48
 
 #define ATA_SUBBASE_SHT(drv_name)				\
 	__ATA_BASE_SHT(drv_name),				\
-- 
cgit v1.2.3


From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Aug 2022 22:56:40 +0200
Subject: mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW

Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know
that FOLL_FORCE can be possibly dangerous, especially if there are races
that can be exploited by user space.

Right now, it would be sufficient to have some code that sets a PTE of a
R/O-mapped shared page dirty, in order for it to erroneously become
writable by FOLL_FORCE.  The implications of setting a write-protected PTE
dirty might not be immediately obvious to everyone.

And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set
pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map
a shmem page R/O while marking the pte dirty.  This can be used by
unprivileged user space to modify tmpfs/shmem file content even if the
user does not have write permissions to the file, and to bypass memfd
write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590).

To fix such security issues for good, the insight is that we really only
need that fancy retry logic (FOLL_COW) for COW mappings that are not
writable (!VM_WRITE).  And in a COW mapping, we really only broke COW if
we have an exclusive anonymous page mapped.  If we have something else
mapped, or the mapped anonymous page might be shared (!PageAnonExclusive),
we have to trigger a write fault to break COW.  If we don't find an
exclusive anonymous page when we retry, we have to trigger COW breaking
once again because something intervened.

Let's move away from this mandatory-retry + dirty handling and rely on our
PageAnonExclusive() flag for making a similar decision, to use the same
COW logic as in other kernel parts here as well.  In case we stumble over
a PTE in a COW mapping that does not map an exclusive anonymous page, COW
was not properly broken and we have to trigger a fake write-fault to break
COW.

Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e
("mm/mprotect: try avoiding write faults for exclusive anonymous pages
when changing protection") and commit 76aefad628aa ("mm/mprotect: fix
soft-dirty check in can_change_pte_writable()"), take care of softdirty
and uffd-wp manually.

For example, a write() via /proc/self/mem to a uffd-wp-protected range has
to fail instead of silently granting write access and bypassing the
userspace fault handler.  Note that FOLL_FORCE is not only used for debug
access, but also triggered by applications without debug intentions, for
example, when pinning pages via RDMA.

This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are
affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR.

Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So
let's just get rid of it.

Thanks to Nadav Amit for pointing out that the pte_dirty() check in
FOLL_FORCE code is problematic and might be exploitable.

Note 1: We don't check for the PTE being dirty because it doesn't matter
	for making a "was COWed" decision anymore, and whoever modifies the
	page has to set the page dirty either way.

Note 2: Kernels before extended uffd-wp support and before
	PageAnonExclusive (< 5.19) can simply revert the problematic
	commit instead and be safe regarding UFFDIO_CONTINUE. A backport to
	v5.19 requires minor adjustments due to lack of
	vma_soft_dirty_enabled().

Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com
Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte")
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: <stable@vger.kernel.org>	[5.16]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  1 -
 mm/gup.c           | 68 ++++++++++++++++++++++++++++++++++++------------------
 mm/huge_memory.c   | 64 +++++++++++++++++++++++++++++++++-----------------
 3 files changed, 89 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bedc449c14d..982f2607180b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
 #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
-#define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
 #define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
diff --git a/mm/gup.c b/mm/gup.c
index 732825157430..5abdaf487460 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -478,14 +478,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 	return -EEXIST;
 }
 
-/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+					struct vm_area_struct *vma,
+					unsigned int flags)
 {
-	return pte_write(pte) ||
-		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+	/* If the pte is writable, we can write to the page. */
+	if (pte_write(pte))
+		return true;
+
+	/* Maybe FOLL_FORCE is set to override it? */
+	if (!(flags & FOLL_FORCE))
+		return false;
+
+	/* But FOLL_FORCE has no effect on shared mappings */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+		return false;
+
+	/* ... or read-only private ones */
+	if (!(vma->vm_flags & VM_MAYWRITE))
+		return false;
+
+	/* ... or already writable ones that just need to take a write fault */
+	if (vma->vm_flags & VM_WRITE)
+		return false;
+
+	/*
+	 * See can_change_pte_writable(): we broke COW and could map the page
+	 * writable if we have an exclusive anonymous page ...
+	 */
+	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+		return false;
+
+	/* ... and a write-fault isn't required for other reasons. */
+	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+		return false;
+	return !userfaultfd_pte_wp(vma, pte);
 }
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -528,12 +556,19 @@ retry:
 	}
 	if ((flags & FOLL_NUMA) && pte_protnone(pte))
 		goto no_page;
-	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
-		pte_unmap_unlock(ptep, ptl);
-		return NULL;
-	}
 
 	page = vm_normal_page(vma, address, pte);
+
+	/*
+	 * We only care about anon pages in can_follow_write_pte() and don't
+	 * have to worry about pte_devmap() because they are never anon.
+	 */
+	if ((flags & FOLL_WRITE) &&
+	    !can_follow_write_pte(pte, page, vma, flags)) {
+		page = NULL;
+		goto out;
+	}
+
 	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
 		/*
 		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
@@ -986,17 +1021,6 @@ static int faultin_page(struct vm_area_struct *vma,
 		return -EBUSY;
 	}
 
-	/*
-	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
-	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
-	 * can thus safely do subsequent page lookups as if they were reads.
-	 * But only do so when looping for pte_write is futile: in some cases
-	 * userspace may also be wanting to write to the gotten user page,
-	 * which a read fault here might prevent (a readonly page might get
-	 * reCOWed by userspace write).
-	 */
-	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
-		*flags |= FOLL_COW;
 	return 0;
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8a7c1b344abe..e9414ee57c5b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1040,12 +1040,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	assert_spin_locked(pmd_lockptr(mm, pmd));
 
-	/*
-	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
-	 * not be in this function with `flags & FOLL_COW` set.
-	 */
-	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
-
 	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
 			 (FOLL_PIN | FOLL_GET)))
@@ -1395,14 +1389,42 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+					struct vm_area_struct *vma,
+					unsigned int flags)
 {
-	return pmd_write(pmd) ||
-	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+	/* If the pmd is writable, we can write to the page. */
+	if (pmd_write(pmd))
+		return true;
+
+	/* Maybe FOLL_FORCE is set to override it? */
+	if (!(flags & FOLL_FORCE))
+		return false;
+
+	/* But FOLL_FORCE has no effect on shared mappings */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+		return false;
+
+	/* ... or read-only private ones */
+	if (!(vma->vm_flags & VM_MAYWRITE))
+		return false;
+
+	/* ... or already writable ones that just need to take a write fault */
+	if (vma->vm_flags & VM_WRITE)
+		return false;
+
+	/*
+	 * See can_change_pte_writable(): we broke COW and could map the page
+	 * writable if we have an exclusive anonymous page ...
+	 */
+	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+		return false;
+
+	/* ... and a write-fault isn't required for other reasons. */
+	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+		return false;
+	return !userfaultfd_huge_pmd_wp(vma, pmd);
 }
 
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1411,12 +1433,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned int flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct page *page = NULL;
+	struct page *page;
 
 	assert_spin_locked(pmd_lockptr(mm, pmd));
 
-	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
-		goto out;
+	page = pmd_page(*pmd);
+	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+	if ((flags & FOLL_WRITE) &&
+	    !can_follow_write_pmd(*pmd, page, vma, flags))
+		return NULL;
 
 	/* Avoid dumping huge zero page */
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
@@ -1424,10 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
 	/* Full NUMA hinting faults to serialise migration in fault paths */
 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
-		goto out;
-
-	page = pmd_page(*pmd);
-	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+		return NULL;
 
 	if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
 		return ERR_PTR(-EMLINK);
@@ -1444,7 +1467,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
 
-out:
 	return page;
 }
 
-- 
cgit v1.2.3


From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001
From: Hao Lee <haolee.swjtu@gmail.com>
Date: Sun, 7 Aug 2022 15:44:42 +0000
Subject: mm: add DEVICE_ZONE to FOR_ALL_ZONES

FOR_ALL_ZONES should be consistent with enum zone_type.  Otherwise,
__count_zid_vm_events have the potential to add count to wrong item when
zid is ZONE_DEVICE.

Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io
Signed-off-by: Hao Lee <haolee.swjtu@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vm_event_item.h | 15 +++++++++++----
 mm/vmstat.c                   |  9 ++++++++-
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 404024486fa5..f3fc36cd2276 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -20,12 +20,19 @@
 #define HIGHMEM_ZONE(xx)
 #endif
 
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##_DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \
+	HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
-		FOR_ALL_ZONES(PGALLOC),
-		FOR_ALL_ZONES(ALLOCSTALL),
-		FOR_ALL_ZONES(PGSCAN_SKIP),
+		FOR_ALL_ZONES(PGALLOC)
+		FOR_ALL_ZONES(ALLOCSTALL)
+		FOR_ALL_ZONES(PGSCAN_SKIP)
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 373d2730fcf2..90af9a8572f5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1168,8 +1168,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 #define TEXT_FOR_HIGHMEM(xx)
 #endif
 
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
-					TEXT_FOR_HIGHMEM(xx) xx "_movable",
+					TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+					TEXT_FOR_DEVICE(xx)
 
 const char * const vmstat_text[] = {
 	/* enum zone_stat_item counters */
-- 
cgit v1.2.3


From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 11 Aug 2022 16:13:40 -0400
Subject: mm/uffd: reset write protection when unregister with wp-mode

The motivation of this patch comes from a recent report and patchfix from
David Hildenbrand on hugetlb shared handling of wr-protected page [1].

With the reproducer provided in commit message of [1], one can leverage
the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect
not only the attacker process, but also the whole system.

The lazy-reset mechanism of uffd-wp was used to make unregister faster,
meanwhile it has an assumption that any leftover pgtable entries should
only affect the process on its own, so not only the user should be aware
of anything it does, but also it should not affect outside of the process.

But it seems that this is not true, and it can also be utilized to make
some exploit easier.

So far there's no clue showing that the lazy-reset is important to any
userfaultfd users because normally the unregister will only happen once
for a specific range of memory of the lifecycle of the process.

Considering all above, what this patch proposes is to do explicit pte
resets when unregister an uffd region with wr-protect mode enabled.

It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false)
right before ioctl(UFFDIO_UNREGISTER) for the user.  So potentially it'll
make the unregister slower.  From that pov it's a very slight abi change,
but hopefully nothing should break with this change either.

Regarding to the change itself - core of uffd write [un]protect operation
is moved into a separate function (uffd_wp_range()) and it is reused in
the unregister code path.

Note that the new function will not check for anything, e.g.  ranges or
memory types, because they should have been checked during the previous
UFFDIO_REGISTER or it should have failed already.  It also doesn't check
mmap_changing because we're with mmap write lock held anyway.

I added a Fixes upon introducing of uffd-wp shmem+hugetlbfs because that's
the only issue reported so far and that's the commit David's reproducer
will start working (v5.19+).  But the whole idea actually applies to not
only file memories but also anonymous.  It's just that we don't need to
fix anonymous prior to v5.19- because there's no known way to exploit.

IOW, this patch can also fix the issue reported in [1] as the patch 2 does.

[1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/

Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              |  4 ++++
 include/linux/userfaultfd_k.h |  2 ++
 mm/userfaultfd.c              | 29 ++++++++++++++++++-----------
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1c44bf75f916..175de70e3adf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}
 
+		/* Reset ptes for the whole vma range if wr-protected */
+		if (userfaultfd_wp(vma))
+			uffd_wp_range(mm, vma, start, vma_end - start, false);
+
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 732b522bacb7..e1b8a915e9e9 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, atomic_t *mmap_changing);
+extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+			  unsigned long start, unsigned long len, bool enable_wp);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 07d3befc80e4..7327b2573f7c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -703,14 +703,29 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
 			      mmap_changing, 0);
 }
 
+void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+		   unsigned long start, unsigned long len, bool enable_wp)
+{
+	struct mmu_gather tlb;
+	pgprot_t newprot;
+
+	if (enable_wp)
+		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+	else
+		newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+	tlb_gather_mmu(&tlb, dst_mm);
+	change_protection(&tlb, dst_vma, start, start + len, newprot,
+			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+	tlb_finish_mmu(&tlb);
+}
+
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 			unsigned long len, bool enable_wp,
 			atomic_t *mmap_changing)
 {
 	struct vm_area_struct *dst_vma;
 	unsigned long page_mask;
-	struct mmu_gather tlb;
-	pgprot_t newprot;
 	int err;
 
 	/*
@@ -750,15 +765,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 			goto out_unlock;
 	}
 
-	if (enable_wp)
-		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
-	else
-		newprot = vm_get_page_prot(dst_vma->vm_flags);
-
-	tlb_gather_mmu(&tlb, dst_mm);
-	change_protection(&tlb, dst_vma, start, start + len, newprot,
-			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
-	tlb_finish_mmu(&tlb);
+	uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
 
 	err = 0;
 out_unlock:
-- 
cgit v1.2.3


From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 10 Aug 2022 21:51:09 -0700
Subject: mm/shmem: fix chattr fsflags support in tmpfs

ext[234] have always allowed unimplemented chattr flags to be set, but
other filesystems have tended to be stricter.  Follow the stricter
approach for tmpfs: I don't want to have to explain why csu attributes
don't actually work, and we won't need to update the chattr(1) manpage;
and it's never wrong to start off strict, relaxing later if persuaded.
Allow only a (append only) i (immutable) A (no atime) and d (no dump).

Although lsattr showed 'A' inherited, the NOATIME behavior was not being
inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME.  Add
shmem_set_inode_flags() to sync the flags, using inode_set_flags() to
avoid that instant of lost immutablility during fileattr_set().

But that change switched generic/079 from passing to failing: because
FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the
INHERITED fsflags: remove them and generic/079 is back to passing.

Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com
Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Radoslaw Burny <rburny@google.com>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/shmem_fs.h | 13 ++++--------
 mm/shmem.c               | 54 +++++++++++++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 1b6c4013f691..ff0b990de83d 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -29,15 +29,10 @@ struct shmem_inode_info {
 	struct inode		vfs_inode;
 };
 
-#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE
-#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE
-#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define SHMEM_FL_USER_VISIBLE		FS_FL_USER_VISIBLE
+#define SHMEM_FL_USER_MODIFIABLE \
+	(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
+#define SHMEM_FL_INHERITED		(FS_NODUMP_FL | FS_NOATIME_FL)
 
 struct shmem_sb_info {
 	unsigned long max_blocks;   /* How many blocks are allowed */
diff --git a/mm/shmem.c b/mm/shmem.c
index 5783f11351bb..170b4078420f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2281,16 +2281,34 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned shmem_mask_flags(umode_t mode, __u32 flags)
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+
+/*
+ * chattr's fsflags are unrelated to extended attributes,
+ * but tmpfs has chosen to enable them under the same config option.
+ */
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+	unsigned int i_flags = 0;
+
+	if (fsflags & FS_NOATIME_FL)
+		i_flags |= S_NOATIME;
+	if (fsflags & FS_APPEND_FL)
+		i_flags |= S_APPEND;
+	if (fsflags & FS_IMMUTABLE_FL)
+		i_flags |= S_IMMUTABLE;
+	/*
+	 * But FS_NODUMP_FL does not require any action in i_flags.
+	 */
+	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+}
+#else
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
 {
-	if (S_ISDIR(mode))
-		return flags;
-	else if (S_ISREG(mode))
-		return flags & SHMEM_REG_FLMASK;
-	else
-		return flags & SHMEM_OTHER_FLMASK;
 }
+#define shmem_initxattrs NULL
+#endif
 
 static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
 				     umode_t mode, dev_t dev, unsigned long flags)
@@ -2319,7 +2337,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
 		info->i_crtime = inode->i_mtime;
 		info->fsflags = (dir == NULL) ? 0 :
 			SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
-		info->fsflags = shmem_mask_flags(mode, info->fsflags);
+		if (info->fsflags)
+			shmem_set_inode_flags(inode, info->fsflags);
 		INIT_LIST_HEAD(&info->shrinklist);
 		INIT_LIST_HEAD(&info->swaplist);
 		simple_xattrs_init(&info->xattrs);
@@ -2468,12 +2487,6 @@ out_unacct_blocks:
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
 
-#ifdef CONFIG_TMPFS_XATTR
-static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
-#else
-#define shmem_initxattrs NULL
-#endif
-
 static int
 shmem_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len,
@@ -3179,18 +3192,13 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns,
 
 	if (fileattr_has_fsx(fa))
 		return -EOPNOTSUPP;
+	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
+		return -EOPNOTSUPP;
 
 	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
 		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
 
-	inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME);
-	if (info->fsflags & FS_APPEND_FL)
-		inode->i_flags |= S_APPEND;
-	if (info->fsflags & FS_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
-	if (info->fsflags & FS_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
-
+	shmem_set_inode_flags(inode, info->fsflags);
 	inode->i_ctime = current_time(inode);
 	return 0;
 }
-- 
cgit v1.2.3


From d59b73a66e5e0682442b6d7b4965364e57078b80 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Wed, 3 Aug 2022 10:49:23 +0300
Subject: net/mlx5: Avoid false positive lockdep warning by adding
 lock_class_key

Add a lock_class_key per mlx5 device to avoid a false positive
"possible circular locking dependency" warning by lockdep, on flows
which lock more than one mlx5 device, such as adding SF.

kernel log:
 ======================================================
 WARNING: possible circular locking dependency detected
 5.19.0-rc8+ #2 Not tainted
 ------------------------------------------------------
 kworker/u20:0/8 is trying to acquire lock:
 ffff88812dfe0d98 (&dev->intf_state_mutex){+.+.}-{3:3}, at: mlx5_init_one+0x2e/0x490 [mlx5_core]

 but task is already holding lock:
 ffff888101aa7898 (&(&notifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #1 (&(&notifier->n_head)->rwsem){++++}-{3:3}:
        down_write+0x90/0x150
        blocking_notifier_chain_register+0x53/0xa0
        mlx5_sf_table_init+0x369/0x4a0 [mlx5_core]
        mlx5_init_one+0x261/0x490 [mlx5_core]
        probe_one+0x430/0x680 [mlx5_core]
        local_pci_probe+0xd6/0x170
        work_for_cpu_fn+0x4e/0xa0
        process_one_work+0x7c2/0x1340
        worker_thread+0x6f6/0xec0
        kthread+0x28f/0x330
        ret_from_fork+0x1f/0x30

 -> #0 (&dev->intf_state_mutex){+.+.}-{3:3}:
        __lock_acquire+0x2fc7/0x6720
        lock_acquire+0x1c1/0x550
        __mutex_lock+0x12c/0x14b0
        mlx5_init_one+0x2e/0x490 [mlx5_core]
        mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core]
        auxiliary_bus_probe+0x9d/0xe0
        really_probe+0x1e0/0xaa0
        __driver_probe_device+0x219/0x480
        driver_probe_device+0x49/0x130
        __device_attach_driver+0x1b8/0x280
        bus_for_each_drv+0x123/0x1a0
        __device_attach+0x1a3/0x460
        bus_probe_device+0x1a2/0x260
        device_add+0x9b1/0x1b40
        __auxiliary_device_add+0x88/0xc0
        mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core]
        blocking_notifier_call_chain+0xd5/0x130
        mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core]
        process_one_work+0x7c2/0x1340
        worker_thread+0x59d/0xec0
        kthread+0x28f/0x330
        ret_from_fork+0x1f/0x30

  other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&(&notifier->n_head)->rwsem);
                                lock(&dev->intf_state_mutex);
                                lock(&(&notifier->n_head)->rwsem);
   lock(&dev->intf_state_mutex);

  *** DEADLOCK ***

 4 locks held by kworker/u20:0/8:
  #0: ffff888150612938 ((wq_completion)mlx5_events){+.+.}-{0:0}, at: process_one_work+0x6e2/0x1340
  #1: ffff888100cafdb8 ((work_completion)(&work->work)#3){+.+.}-{0:0}, at: process_one_work+0x70f/0x1340
  #2: ffff888101aa7898 (&(&notifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130
  #3: ffff88813682d0e8 (&dev->mutex){....}-{3:3}, at:__device_attach+0x76/0x460

 stack backtrace:
 CPU: 6 PID: 8 Comm: kworker/u20:0 Not tainted 5.19.0-rc8+
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core]
 Call Trace:
  <TASK>
  dump_stack_lvl+0x57/0x7d
  check_noncircular+0x278/0x300
  ? print_circular_bug+0x460/0x460
  ? lock_chain_count+0x20/0x20
  ? register_lock_class+0x1880/0x1880
  __lock_acquire+0x2fc7/0x6720
  ? register_lock_class+0x1880/0x1880
  ? register_lock_class+0x1880/0x1880
  lock_acquire+0x1c1/0x550
  ? mlx5_init_one+0x2e/0x490 [mlx5_core]
  ? lockdep_hardirqs_on_prepare+0x400/0x400
  __mutex_lock+0x12c/0x14b0
  ? mlx5_init_one+0x2e/0x490 [mlx5_core]
  ? mlx5_init_one+0x2e/0x490 [mlx5_core]
  ? _raw_read_unlock+0x1f/0x30
  ? mutex_lock_io_nested+0x1320/0x1320
  ? __ioremap_caller.constprop.0+0x306/0x490
  ? mlx5_sf_dev_probe+0x269/0x370 [mlx5_core]
  ? iounmap+0x160/0x160
  mlx5_init_one+0x2e/0x490 [mlx5_core]
  mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core]
  ? mlx5_sf_dev_remove+0x130/0x130 [mlx5_core]
  auxiliary_bus_probe+0x9d/0xe0
  really_probe+0x1e0/0xaa0
  __driver_probe_device+0x219/0x480
  ? auxiliary_match_id+0xe9/0x140
  driver_probe_device+0x49/0x130
  __device_attach_driver+0x1b8/0x280
  ? driver_allows_async_probing+0x140/0x140
  bus_for_each_drv+0x123/0x1a0
  ? bus_for_each_dev+0x1a0/0x1a0
  ? lockdep_hardirqs_on_prepare+0x286/0x400
  ? trace_hardirqs_on+0x2d/0x100
  __device_attach+0x1a3/0x460
  ? device_driver_attach+0x1e0/0x1e0
  ? kobject_uevent_env+0x22d/0xf10
  bus_probe_device+0x1a2/0x260
  device_add+0x9b1/0x1b40
  ? dev_set_name+0xab/0xe0
  ? __fw_devlink_link_to_suppliers+0x260/0x260
  ? memset+0x20/0x40
  ? lockdep_init_map_type+0x21a/0x7d0
  __auxiliary_device_add+0x88/0xc0
  ? auxiliary_device_init+0x86/0xa0
  mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core]
  blocking_notifier_call_chain+0xd5/0x130
  mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core]
  ? mlx5_vhca_event_arm+0x100/0x100 [mlx5_core]
  ? lock_downgrade+0x6e0/0x6e0
  ? lockdep_hardirqs_on_prepare+0x286/0x400
  process_one_work+0x7c2/0x1340
  ? lockdep_hardirqs_on_prepare+0x400/0x400
  ? pwq_dec_nr_in_flight+0x230/0x230
  ? rwlock_bug.part.0+0x90/0x90
  worker_thread+0x59d/0xec0
  ? process_one_work+0x1340/0x1340
  kthread+0x28f/0x330
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x1f/0x30
  </TASK>

Fixes: 6a3273217469 ("net/mlx5: SF, Port function state change support")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 ++++
 include/linux/mlx5/driver.h                    | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index bec8d6d0b5f6..c085b031abfc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1530,7 +1530,9 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 	memcpy(&dev->profile, &profile[profile_idx], sizeof(dev->profile));
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
+	lockdep_register_key(&dev->lock_key);
 	mutex_init(&dev->intf_state_mutex);
+	lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key);
 
 	mutex_init(&priv->bfregs.reg_head.lock);
 	mutex_init(&priv->bfregs.wc_head.lock);
@@ -1597,6 +1599,7 @@ err_timeout_init:
 	mutex_destroy(&priv->bfregs.wc_head.lock);
 	mutex_destroy(&priv->bfregs.reg_head.lock);
 	mutex_destroy(&dev->intf_state_mutex);
+	lockdep_unregister_key(&dev->lock_key);
 	return err;
 }
 
@@ -1618,6 +1621,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
 	mutex_destroy(&priv->bfregs.wc_head.lock);
 	mutex_destroy(&priv->bfregs.reg_head.lock);
 	mutex_destroy(&dev->intf_state_mutex);
+	lockdep_unregister_key(&dev->lock_key);
 }
 
 static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 96b16fbe1aa4..7b7ce602c808 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -779,6 +779,7 @@ struct mlx5_core_dev {
 	enum mlx5_device_state	state;
 	/* sync interface state */
 	struct mutex		intf_state_mutex;
+	struct lock_class_key	lock_key;
 	unsigned long		intf_state;
 	struct mlx5_priv	priv;
 	struct mlx5_profile	profile;
-- 
cgit v1.2.3


From 7997eff82828304b780dc0a39707e1946d6f1ebf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 20 Aug 2022 17:38:37 +0200
Subject: netfilter: ebtables: reject blobs that don't provide all entry points

Harshit Mogalapalli says:
 In ebt_do_table() function dereferencing 'private->hook_entry[hook]'
 can lead to NULL pointer dereference. [..] Kernel panic:

general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f]
[..]
RIP: 0010:ebt_do_table+0x1dc/0x1ce0
Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 5c 16 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b 6c df 08 48 8d 7d 2c 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 88
[..]
Call Trace:
 nf_hook_slow+0xb1/0x170
 __br_forward+0x289/0x730
 maybe_deliver+0x24b/0x380
 br_flood+0xc6/0x390
 br_dev_xmit+0xa2e/0x12c0

For some reason ebtables rejects blobs that provide entry points that are
not supported by the table, but what it should instead reject is the
opposite: blobs that DO NOT provide an entry point supported by the table.

t->valid_hooks is the bitmask of hooks (input, forward ...) that will see
packets.  Providing an entry point that is not support is harmless
(never called/used), but the inverse isn't: it results in a crash
because the ebtables traverser doesn't expect a NULL blob for a location
its receiving packets for.

Instead of fixing all the individual checks, do what iptables is doing and
reject all blobs that differ from the expected hooks.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Reported-by: syzkaller <syzkaller@googlegroups.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter_bridge/ebtables.h | 4 ----
 net/bridge/netfilter/ebtable_broute.c     | 8 --------
 net/bridge/netfilter/ebtable_filter.c     | 8 --------
 net/bridge/netfilter/ebtable_nat.c        | 8 --------
 net/bridge/netfilter/ebtables.c           | 8 +-------
 5 files changed, 1 insertion(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index a13296d6c7ce..fd533552a062 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -94,10 +94,6 @@ struct ebt_table {
 	struct ebt_replace_kernel *table;
 	unsigned int valid_hooks;
 	rwlock_t lock;
-	/* e.g. could be the table explicitly only allows certain
-	 * matches, targets, ... 0 == let it in */
-	int (*check)(const struct ebt_table_info *info,
-	   unsigned int valid_hooks);
 	/* the data used by the kernel */
 	struct ebt_table_info *private;
 	struct nf_hook_ops *ops;
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 1a11064f9990..8f19253024b0 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -36,18 +36,10 @@ static struct ebt_replace_kernel initial_table = {
 	.entries	= (char *)&initial_chain,
 };
 
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
-	if (valid_hooks & ~(1 << NF_BR_BROUTING))
-		return -EINVAL;
-	return 0;
-}
-
 static const struct ebt_table broute_table = {
 	.name		= "broute",
 	.table		= &initial_table,
 	.valid_hooks	= 1 << NF_BR_BROUTING,
-	.check		= check,
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index cb949436bc0e..278f324e6752 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = {
 	.entries	= (char *)initial_chains,
 };
 
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
-	if (valid_hooks & ~FILTER_VALID_HOOKS)
-		return -EINVAL;
-	return 0;
-}
-
 static const struct ebt_table frame_filter = {
 	.name		= "filter",
 	.table		= &initial_table,
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.check		= check,
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 5ee0531ae506..9066f7f376d5 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = {
 	.entries	= (char *)initial_chains,
 };
 
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
-	if (valid_hooks & ~NAT_VALID_HOOKS)
-		return -EINVAL;
-	return 0;
-}
-
 static const struct ebt_table frame_nat = {
 	.name		= "nat",
 	.table		= &initial_table,
 	.valid_hooks	= NAT_VALID_HOOKS,
-	.check		= check,
 	.me		= THIS_MODULE,
 };
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f2dbefb61ce8..9a0ae59cdc50 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1040,8 +1040,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
 		goto free_iterate;
 	}
 
-	/* the table doesn't like it */
-	if (t->check && (ret = t->check(newinfo, repl->valid_hooks)))
+	if (repl->valid_hooks != t->valid_hooks)
 		goto free_unlock;
 
 	if (repl->num_counters && repl->num_counters != t->private->nentries) {
@@ -1231,11 +1230,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
 	if (ret != 0)
 		goto free_chainstack;
 
-	if (table->check && table->check(newinfo, table->valid_hooks)) {
-		ret = -EINVAL;
-		goto free_chainstack;
-	}
-
 	table->private = newinfo;
 	rwlock_init(&table->lock);
 	mutex_lock(&ebt_mutex);
-- 
cgit v1.2.3


From af67508ea6cbf0e4ea27f8120056fa2efce127dd Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Aug 2022 10:46:56 -0700
Subject: net: Fix data-races around sysctl_fb_tunnels_only_for_init_net.

While reading sysctl_fb_tunnels_only_for_init_net, it can be changed
concurrently.  Thus, we need to add READ_ONCE() to its readers.

Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1a3cb93c3dcc..6d3a33fd0cdb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -640,9 +640,14 @@ extern int sysctl_devconf_inherit_init_net;
  */
 static inline bool net_has_fallback_tunnels(const struct net *net)
 {
-	return !IS_ENABLED(CONFIG_SYSCTL) ||
-	       !sysctl_fb_tunnels_only_for_init_net ||
-	       (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1);
+#if IS_ENABLED(CONFIG_SYSCTL)
+	int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);
+
+	return !fb_tunnels_only_for_init_net ||
+		(net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1);
+#else
+	return true;
+#endif
 }
 
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
-- 
cgit v1.2.3


From a5612ca10d1aa05624ebe72633e0c8c792970833 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Tue, 23 Aug 2022 10:46:57 -0700
Subject: net: Fix data-races around sysctl_devconf_inherit_init_net.

While reading sysctl_devconf_inherit_init_net, it can be changed
concurrently.  Thus, we need to add READ_ONCE() to its readers.

Fixes: 856c395cfa63 ("net: introduce a knob to control whether to inherit devconf config")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 +++++++++
 net/ipv4/devinet.c        | 16 ++++++++++------
 net/ipv6/addrconf.c       |  5 ++---
 3 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6d3a33fd0cdb..05d6f3facd5a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -650,6 +650,15 @@ static inline bool net_has_fallback_tunnels(const struct net *net)
 #endif
 }
 
+static inline int net_inherit_devconf(void)
+{
+#if IS_ENABLED(CONFIG_SYSCTL)
+	return READ_ONCE(sysctl_devconf_inherit_init_net);
+#else
+	return 0;
+#endif
+}
+
 static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
 {
 #if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 92b778e423df..e8b9a9202fec 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2682,23 +2682,27 @@ static __net_init int devinet_init_net(struct net *net)
 #endif
 
 	if (!net_eq(net, &init_net)) {
-		if (IS_ENABLED(CONFIG_SYSCTL) &&
-		    sysctl_devconf_inherit_init_net == 3) {
+		switch (net_inherit_devconf()) {
+		case 3:
 			/* copy from the current netns */
 			memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all,
 			       sizeof(ipv4_devconf));
 			memcpy(dflt,
 			       current->nsproxy->net_ns->ipv4.devconf_dflt,
 			       sizeof(ipv4_devconf_dflt));
-		} else if (!IS_ENABLED(CONFIG_SYSCTL) ||
-			   sysctl_devconf_inherit_init_net != 2) {
-			/* inherit == 0 or 1: copy from init_net */
+			break;
+		case 0:
+		case 1:
+			/* copy from init_net */
 			memcpy(all, init_net.ipv4.devconf_all,
 			       sizeof(ipv4_devconf));
 			memcpy(dflt, init_net.ipv4.devconf_dflt,
 			       sizeof(ipv4_devconf_dflt));
+			break;
+		case 2:
+			/* use compiled values */
+			break;
 		}
-		/* else inherit == 2: use compiled values */
 	}
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b624e3d8c5f0..e15f64f22fa8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -7162,9 +7162,8 @@ static int __net_init addrconf_init_net(struct net *net)
 	if (!dflt)
 		goto err_alloc_dflt;
 
-	if (IS_ENABLED(CONFIG_SYSCTL) &&
-	    !net_eq(net, &init_net)) {
-		switch (sysctl_devconf_inherit_init_net) {
+	if (!net_eq(net, &init_net)) {
+		switch (net_inherit_devconf()) {
 		case 1:  /* copy from init_net */
 			memcpy(all, init_net.ipv6.devconf_all,
 			       sizeof(ipv6_devconf));
-- 
cgit v1.2.3


From 2a5840124009f133bd09fd855963551fb2cefe22 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 15 Jul 2022 12:16:22 -0700
Subject: lsm,io_uring: add LSM hooks for the new uring_cmd file op

io-uring cmd support was added through ee692a21e9bf ("fs,io_uring:
add infrastructure for uring-cmd"), this extended the struct
file_operations to allow a new command which each subsystem can use
to enable command passthrough. Add an LSM specific for the command
passthrough which enables LSMs to inspect the command details.

This was discussed long ago without no clear pointer for something
conclusive, so this enables LSMs to at least reject this new file
operation.

[0] https://lkml.kernel.org/r/8adf55db-7bab-f59d-d612-ed906b948d19@schaufler-ca.com

Cc: stable@vger.kernel.org
Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd")
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 1 +
 include/linux/lsm_hooks.h     | 3 +++
 include/linux/security.h      | 5 +++++
 io_uring/uring_cmd.c          | 5 +++++
 security/security.c           | 4 ++++
 5 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 806448173033..60fff133c0b1 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -407,4 +407,5 @@ LSM_HOOK(int, 0, perf_event_write, struct perf_event *event)
 #ifdef CONFIG_IO_URING
 LSM_HOOK(int, 0, uring_override_creds, const struct cred *new)
 LSM_HOOK(int, 0, uring_sqpoll, void)
+LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd)
 #endif /* CONFIG_IO_URING */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 84a0d7e02176..3aa6030302f5 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1582,6 +1582,9 @@
  *      Check whether the current task is allowed to spawn a io_uring polling
  *      thread (IORING_SETUP_SQPOLL).
  *
+ * @uring_cmd:
+ *      Check whether the file_operations uring_cmd is allowed to run.
+ *
  */
 union security_list_options {
 	#define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
diff --git a/include/linux/security.h b/include/linux/security.h
index 1bc362cb413f..7bd0c490703d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2060,6 +2060,7 @@ static inline int security_perf_event_write(struct perf_event *event)
 #ifdef CONFIG_SECURITY
 extern int security_uring_override_creds(const struct cred *new);
 extern int security_uring_sqpoll(void);
+extern int security_uring_cmd(struct io_uring_cmd *ioucmd);
 #else
 static inline int security_uring_override_creds(const struct cred *new)
 {
@@ -2069,6 +2070,10 @@ static inline int security_uring_sqpoll(void)
 {
 	return 0;
 }
+static inline int security_uring_cmd(struct io_uring_cmd *ioucmd)
+{
+	return 0;
+}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_IO_URING */
 
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 8e0cc2d9205e..0f7ad956ddcb 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -3,6 +3,7 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/io_uring.h>
+#include <linux/security.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -88,6 +89,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	if (!req->file->f_op->uring_cmd)
 		return -EOPNOTSUPP;
 
+	ret = security_uring_cmd(ioucmd);
+	if (ret)
+		return ret;
+
 	if (ctx->flags & IORING_SETUP_SQE128)
 		issue_flags |= IO_URING_F_SQE128;
 	if (ctx->flags & IORING_SETUP_CQE32)
diff --git a/security/security.c b/security/security.c
index 14d30fec8a00..4b95de24bc8d 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2660,4 +2660,8 @@ int security_uring_sqpoll(void)
 {
 	return call_int_hook(uring_sqpoll, 0);
 }
+int security_uring_cmd(struct io_uring_cmd *ioucmd)
+{
+	return call_int_hook(uring_cmd, 0, ioucmd);
+}
 #endif /* CONFIG_IO_URING */
-- 
cgit v1.2.3


From 8238b4579866b7c1bb99883cfe102a43db5506ff Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 26 Aug 2022 09:17:08 -0400
Subject: wait_on_bit: add an acquire memory barrier

There are several places in the kernel where wait_on_bit is not followed
by a memory barrier (for example, in drivers/md/dm-bufio.c:new_read).

On architectures with weak memory ordering, it may happen that memory
accesses that follow wait_on_bit are reordered before wait_on_bit and
they may return invalid data.

Fix this class of bugs by introducing a new function "test_bit_acquire"
that works like test_bit, but has acquire memory ordering semantics.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/atomic_bitops.txt                     | 10 ++++------
 arch/x86/include/asm/bitops.h                       | 21 +++++++++++++++++++++
 include/asm-generic/bitops/generic-non-atomic.h     | 14 ++++++++++++++
 .../asm-generic/bitops/instrumented-non-atomic.h    | 12 ++++++++++++
 include/asm-generic/bitops/non-atomic.h             |  1 +
 .../bitops/non-instrumented-non-atomic.h            |  1 +
 include/linux/bitops.h                              |  1 +
 include/linux/buffer_head.h                         |  2 +-
 include/linux/wait_bit.h                            |  8 ++++----
 kernel/sched/wait_bit.c                             |  2 +-
 10 files changed, 60 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index d8b101c97031..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
 
  - RMW operations that have a return value are fully ordered.
 
- - RMW operations that are conditional are unordered on FAILURE,
-   otherwise the above rules apply. In the case of test_and_set_bit_lock(),
-   if the bit in memory is unchanged by the operation then it is deemed to have
-   failed.
+ - RMW operations that are conditional are fully ordered.
 
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
+ACQUIRE semantics.
 
 Since a platform only has a single means of achieving atomic operations
 the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 973c6bd17f98..0fe9de58af31 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -207,6 +207,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l
 		(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
 }
 
+static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr)
+{
+	bool oldbit;
+
+	asm volatile("testb %2,%1"
+		     CC_SET(nz)
+		     : CC_OUT(nz) (oldbit)
+		     : "m" (((unsigned char *)addr)[nr >> 3]),
+		       "i" (1 << (nr & 7))
+		     :"memory");
+
+	return oldbit;
+}
+
 static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
 {
 	bool oldbit;
@@ -226,6 +240,13 @@ arch_test_bit(unsigned long nr, const volatile unsigned long *addr)
 					  variable_test_bit(nr, addr);
 }
 
+static __always_inline bool
+arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+	return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) :
+					  variable_test_bit(nr, addr);
+}
+
 /**
  * __ffs - find first set bit in word
  * @word: The word to search
diff --git a/include/asm-generic/bitops/generic-non-atomic.h b/include/asm-generic/bitops/generic-non-atomic.h
index 3d5ebd24652b..564a8c675d85 100644
--- a/include/asm-generic/bitops/generic-non-atomic.h
+++ b/include/asm-generic/bitops/generic-non-atomic.h
@@ -4,6 +4,7 @@
 #define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
 
 #include <linux/bits.h>
+#include <asm/barrier.h>
 
 #ifndef _LINUX_BITOPS_H
 #error only <linux/bitops.h> can be included directly
@@ -127,6 +128,18 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
 	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
 }
 
+/**
+ * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	return 1UL & (smp_load_acquire(p) >> (nr & (BITS_PER_LONG-1)));
+}
+
 /*
  * const_*() definitions provide good compile-time optimizations when
  * the passed arguments can be resolved at compile time.
@@ -137,6 +150,7 @@ generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
 #define const___test_and_set_bit	generic___test_and_set_bit
 #define const___test_and_clear_bit	generic___test_and_clear_bit
 #define const___test_and_change_bit	generic___test_and_change_bit
+#define const_test_bit_acquire		generic_test_bit_acquire
 
 /**
  * const_test_bit - Determine whether a bit is set
diff --git a/include/asm-generic/bitops/instrumented-non-atomic.h b/include/asm-generic/bitops/instrumented-non-atomic.h
index 988a3bbfba34..2b238b161a62 100644
--- a/include/asm-generic/bitops/instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/instrumented-non-atomic.h
@@ -142,4 +142,16 @@ _test_bit(unsigned long nr, const volatile unsigned long *addr)
 	return arch_test_bit(nr, addr);
 }
 
+/**
+ * _test_bit_acquire - Determine, with acquire semantics, whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline bool
+_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
+{
+	instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
+	return arch_test_bit_acquire(nr, addr);
+}
+
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 5c37ced343ae..71f8d54a5195 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -13,6 +13,7 @@
 #define arch___test_and_change_bit generic___test_and_change_bit
 
 #define arch_test_bit generic_test_bit
+#define arch_test_bit_acquire generic_test_bit_acquire
 
 #include <asm-generic/bitops/non-instrumented-non-atomic.h>
 
diff --git a/include/asm-generic/bitops/non-instrumented-non-atomic.h b/include/asm-generic/bitops/non-instrumented-non-atomic.h
index bdb9b1ffaee9..0ddc78dfc358 100644
--- a/include/asm-generic/bitops/non-instrumented-non-atomic.h
+++ b/include/asm-generic/bitops/non-instrumented-non-atomic.h
@@ -12,5 +12,6 @@
 #define ___test_and_change_bit	arch___test_and_change_bit
 
 #define _test_bit		arch_test_bit
+#define _test_bit_acquire	arch_test_bit_acquire
 
 #endif /* __ASM_GENERIC_BITOPS_NON_INSTRUMENTED_NON_ATOMIC_H */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf9bf65039f2..3b89c64bcfd8 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -59,6 +59,7 @@ extern unsigned long __sw_hweight64(__u64 w);
 #define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
 #define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
 #define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
+#define test_bit_acquire(nr, addr)	bitop(_test_bit_acquire, nr, addr)
 
 /*
  * Include this here because some architectures need generic_ffs/fls in
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index def8b8d30ccc..089c9ade4325 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -156,7 +156,7 @@ static __always_inline int buffer_uptodate(const struct buffer_head *bh)
 	 * make it consistent with folio_test_uptodate
 	 * pairs with smp_mb__before_atomic in set_buffer_uptodate
 	 */
-	return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0;
+	return test_bit_acquire(BH_Uptodate, &bh->b_state);
 }
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 7dec36aecbd9..7725b7579b78 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -71,7 +71,7 @@ static inline int
 wait_on_bit(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit,
 				       bit_wait,
@@ -96,7 +96,7 @@ static inline int
 wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit,
 				       bit_wait_io,
@@ -123,7 +123,7 @@ wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
 		    unsigned long timeout)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit_timeout(word, bit,
 					       bit_wait_timeout,
@@ -151,7 +151,7 @@ wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
 		   unsigned mode)
 {
 	might_sleep();
-	if (!test_bit(bit, word))
+	if (!test_bit_acquire(bit, word))
 		return 0;
 	return out_of_line_wait_on_bit(word, bit, action, mode);
 }
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index d4788f810b55..0b1cd985dc27 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
 		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
 		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
 			ret = (*action)(&wbq_entry->key, mode);
-	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+	} while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
 
 	finish_wait(wq_head, &wbq_entry->wq_entry);
 
-- 
cgit v1.2.3


From fcab34b433e2c13e333b2f53c4a8409eadc432c7 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 10 Aug 2022 10:53:59 -0600
Subject: mm: re-allow pinning of zero pfns (again)

The below referenced commit makes the same error as 1c563432588d ("mm: fix
is_pinnable_page against a cma page"), re-interpreting the logic to
exclude pinning of the zero page, which breaks device assignment with
vfio.

To avoid further subtle mistakes, split the logic into discrete tests.

[akpm@linux-foundation.org: simplify comment, per John]
Link: https://lkml.kernel.org/r/166015037385.760108.16881097713975517242.stgit@omen
Link: https://lore.kernel.org/all/165490039431.944052.12458624139225785964.stgit@omen
Fixes: f25cbb7a95a2 ("mm: add zone device coherent type memory support")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Felix Kuehling <felix.kuehling@amd.com>
Tested-by: Slawomir Laba <slawomirx.laba@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Alex Sierra <alex.sierra@amd.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 982f2607180b..21f8b27bd9fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1544,9 +1544,16 @@ static inline bool is_longterm_pinnable_page(struct page *page)
 	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
 		return false;
 #endif
-	return !(is_device_coherent_page(page) ||
-		 is_zone_movable_page(page) ||
-		 is_zero_pfn(page_to_pfn(page)));
+	/* The zero page may always be pinned */
+	if (is_zero_pfn(page_to_pfn(page)))
+		return true;
+
+	/* Coherent device memory must always allow eviction. */
+	if (is_device_coherent_page(page))
+		return false;
+
+	/* Otherwise, non-movable zone pages can be pinned. */
+	return !is_zone_movable_page(page);
 }
 #else
 static inline bool is_longterm_pinnable_page(struct page *page)
-- 
cgit v1.2.3


From dbb16df6443c59e8a1ef21c2272fcf387d600ddf Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Wed, 17 Aug 2022 17:21:39 +0000
Subject: Revert "memcg: cleanup racy sum avoidance code"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 96e51ccf1af33e82f429a0d6baebba29c6448d0f.

Recently we started running the kernel with rstat infrastructure on
production traffic and begin to see negative memcg stats values.
Particularly the 'sock' stat is the one which we observed having negative
value.

$ grep "sock " /mnt/memory/job/memory.stat
sock 253952
total_sock 18446744073708724224

Re-run after couple of seconds

$ grep "sock " /mnt/memory/job/memory.stat
sock 253952
total_sock 53248

For now we are only seeing this issue on large machines (256 CPUs) and
only with 'sock' stat.  I think the networking stack increase the stat on
one cpu and decrease it on another cpu much more often.  So, this negative
sock is due to rstat flusher flushing the stats on the CPU that has seen
the decrement of sock but missed the CPU that has increments.  A typical
race condition.

For easy stable backport, revert is the most simple solution.  For long
term solution, I am thinking of two directions.  First is just reduce the
race window by optimizing the rstat flusher.  Second is if the reader sees
a negative stat value, force flush and restart the stat collection.
Basically retry but limited.

Link: https://lkml.kernel.org/r/20220817172139.3141101-1-shakeelb@google.com
Fixes: 96e51ccf1af33e8 ("memcg: cleanup racy sum avoidance code")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: <stable@vger.kernel.org>	[5.15]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4d31ce55b1c0..6257867fbf95 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -987,19 +987,30 @@ static inline void mod_memcg_page_state(struct page *page,
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
-	return READ_ONCE(memcg->vmstats.state[idx]);
+	long x = READ_ONCE(memcg->vmstats.state[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
 					      enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
+	long x;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	return READ_ONCE(pn->lruvec_stats.state[idx]);
+	x = READ_ONCE(pn->lruvec_stats.state[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
-- 
cgit v1.2.3


From dcf8e5633e2e69ad60b730ab5905608b756a032f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 23 Aug 2022 12:59:25 -0700
Subject: tracing: Define the is_signed_type() macro once

There are two definitions of the is_signed_type() macro: one in
<linux/overflow.h> and a second definition in <linux/trace_events.h>.

As suggested by Linus, move the definition of the is_signed_type() macro
into the <linux/compiler.h> header file.  Change the definition of the
is_signed_type() macro to make sure that it does not trigger any sparse
warnings with future versions of sparse for bitwise types.

Link: https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/
Link: https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler.h     | 6 ++++++
 include/linux/overflow.h     | 1 -
 include/linux/trace_events.h | 2 --
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 01ce94b58b42..7713d7bcdaea 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off)
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
+/*
+ * Whether 'type' is a signed type or an unsigned type. Supports scalar types,
+ * bool and also pointer types.
+ */
+#define is_signed_type(type) (((type)(-1)) < (__force type)1)
+
 /*
  * This is needed in functions which generate the stack canary, see
  * arch/x86/kernel/smpboot.c::start_secondary() for an example.
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index f1221d11f8e5..0eb3b192f07a 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -30,7 +30,6 @@
  * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
  * credit to Christian Biere.
  */
-#define is_signed_type(type)       (((type)(-1)) < (type)1)
 #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
 #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
 #define type_min(T) ((T)((T)-type_max(T)-(T)1))
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index b18759a673c6..8401dec93c15 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
 extern int trace_event_get_offsets(struct trace_event_call *call);
 
-#define is_signed_type(type)	(((type)(-1)) < (type)1)
-
 int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
 int trace_set_clr_event(const char *system, const char *event, int set);
 int trace_array_set_clr_event(struct trace_array *tr, const char *system,
-- 
cgit v1.2.3